Setup

Load packages

if ( !require("pacman", quietly = TRUE) )
    install.packages("pacman")

pacman::p_load(
  tidyverse, #Used for basic data handling and visualization.  
  RColorBrewer, #Color palettes for data visualization. 
  gridExtra, #Used to arrange multiple ggplots in a grid.  
  grid, #Used to arrange multiple ggplots in a grid.
  overviewR, #Used to assess missing data.  
  rnaturalearth, #Used to extract geographical data to create maps. 
  sf, #Used together with the prior package to create map. 
  plotly, #Used together with prior two packages to create map.  
  table1, #Used to add labels to variables and to create table of descriptive characteristics of patients.
  flextable, #Used to export tables.  
  officer,  #Used to export tables.
  mgcv, #Used to model non-linear relationships with a general additive model.  
  ggmosaic, #Used to create mosaic plots.   
  car, #Used to visualize distribution of continuous variables (stacked Q-Q plots).
  simpleboot, #Used to calculate mean atelectasis coverage and confidence intervals trhough bootstrapping. 
  boot, #Used to calculate mean atelectasis coverage and confidence intervals trhough bootstrapping.
  dagitty, #Used in conjunction with https://www.dagitty.net/ to create directed acyclic graph to inform statistical modelling.
  lavaan, #Used to create correlation matrix to assess conditional independencies.  
  broom, #Used to exponentiate coefficients of regression models. 
  sandwich, #Used to calculate robust standard errors for prevalence ratios.  
  ordinal, #Used to model ordinal outcome (atelectasis percent) and to test proportional odds assumptions.  
  VGAM, #Used to model partial proportional odds model.   
  gt, #Used to present a summary of the results of regression models.   
  gtsummary, #Used to create table to summarize regression models.  
  gratia #Used together with gglopt2 to create smooth partial effects plot from gam models. 
)

Set seed (needed for reproducibility of bootstrapping) as the current year 2023:

set.seed(2023)
Log session info
# Credits for this chunk of code: Alex Bossers, Utrecht University (a.bossers@uu.nl)  

# remove clutter
session <- sessionInfo()
session$BLAS <- NULL; session$LAPACK <- NULL; session$loadedOnly <- NULL;
# write log file
writeLines(capture.output( print( session, locale=FALSE ) ), 
           paste0(lubridate::today(),"_sessionInfo_prevalencia_atelectasias.txt") )
# also show in notebook.
session 
## R version 4.3.2 (2023-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 11 x64 (build 22621)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=Spanish_Mexico.utf8  LC_CTYPE=Spanish_Mexico.utf8   
## [3] LC_MONETARY=Spanish_Mexico.utf8 LC_NUMERIC=C                   
## [5] LC_TIME=Spanish_Mexico.utf8    
## 
## time zone: Europe/Berlin
## tzcode source: internal
## 
## attached base packages:
##  [1] splines   stats4    grid      stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] gratia_0.8.1        gtsummary_1.7.2     gt_0.10.0          
##  [4] VGAM_1.1-9          ordinal_2022.11-16  sandwich_3.0-2     
##  [7] broom_1.0.5         lavaan_0.6-16       dagitty_0.3-1      
## [10] boot_1.3-28.1       simpleboot_1.1-7    car_3.1-2          
## [13] carData_3.0-5       ggmosaic_0.3.3      mgcv_1.9-0         
## [16] nlme_3.1-163        officer_0.6.3       flextable_0.9.4    
## [19] table1_1.4.3        plotly_4.10.3       sf_1.0-14          
## [22] rnaturalearth_0.3.4 overviewR_0.0.13    gridExtra_2.3      
## [25] RColorBrewer_1.1-3  lubridate_1.9.3     forcats_1.0.0      
## [28] stringr_1.5.0       dplyr_1.1.3         purrr_1.0.2        
## [31] readr_2.1.4         tidyr_1.3.0         tibble_3.2.1       
## [34] ggplot2_3.4.4       tidyverse_2.0.0     pacman_0.5.1

Set working directory and create sub-folders

figfolder   <- "01_output_figures"
tabfolder   <- "01_output_tables"
psfolder    <- "02_ps_subsets"
dir.create( figfolder, showWarnings=FALSE )
dir.create( tabfolder, showWarnings=FALSE )
dir.create( psfolder, showWarnings=FALSE )

Load dataset

data <- read.csv("atelectasis_prevalence.csv", na.strings="NA", row.names = NULL)
str(data)
## 'data.frame':    243 obs. of  42 variables:
##  $ ID                  : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ age                 : int  41 50 45 35 28 52 46 56 59 57 ...
##  $ sex                 : int  1 1 1 1 1 1 1 1 2 1 ...
##  $ weight              : num  116 116 146 101 141 ...
##  $ height              : num  1.62 1.71 1.67 1.48 1.72 1.7 1.75 1.63 1.78 1.62 ...
##  $ BMI                 : num  44.1 39.5 52.2 46.2 47.7 ...
##  $ type_obesity        : int  3 2 3 3 3 1 2 3 3 1 ...
##  $ ARISCAT             : int  23 26 23 23 39 26 23 26 26 26 ...
##  $ ARISCAT_group       : int  1 2 1 1 2 2 1 2 2 2 ...
##  $ ASA                 : int  2 2 3 2 2 2 2 2 2 NA ...
##  $ spo2_VPO            : int  97 94 93 92 90 96 96 96 97 93 ...
##  $ surgical_procedure  : int  2 1 1 1 1 1 1 1 1 4 ...
##  $ CORADS              : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ atelectasis         : int  0 1 1 1 1 0 0 0 0 1 ...
##  $ atelectasis_location: int  NA 1 1 1 2 NA NA NA NA 1 ...
##  $ atelectasis_percent : num  0 2.5 7.5 15 7.5 0 0 0 0 5 ...
##  $ hb                  : num  13.9 14.2 14.2 14.8 14.1 13.8 16.4 14.1 14.2 13.9 ...
##  $ hct                 : num  41.4 41.8 41.7 43.6 42.2 39.9 46.2 41.9 42.2 39.9 ...
##  $ leu                 : num  11.6 5.4 8.6 9.4 7.7 9.8 5.9 10.5 9.5 6.1 ...
##  $ neu_percent         : int  60 62 58 63 68 60 59 67 62 53 ...
##  $ neu_absolute        : num  6.96 3.35 4.99 5.92 5.24 ...
##  $ linf_percent        : int  39 33 38 35 31 38 38 31 36 44 ...
##  $ linf_absolute       : num  4.52 1.78 3.27 3.29 2.39 ...
##  $ mon_percent         : int  1 4 3 1 1 2 2 2 1 2 ...
##  $ mon_absolute        : num  4.52 1.78 3.27 3.29 2.39 ...
##  $ platelets           : int  364 217 218 318 332 392 268 387 322 225 ...
##  $ glucose             : int  74 89 74 76 80 90 121 84 80 98 ...
##  $ urea                : int  21 26 27 14 18 15 28 22 21 32 ...
##  $ creatinine          : num  0.88 0.82 0.64 0.74 0.72 0.6 0.77 0.6 0.71 0.59 ...
##  $ rapid_covid_test    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PCR_covid           : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ surgery_performed   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ state_residence     : chr  "Texas" "Texas" "Texas" "Arkansas" ...
##  $ altitude            : int  519 519 519 198 885 1861 519 31 519 519 ...
##  $ hypertension        : int  0 0 0 0 0 0 0 1 1 0 ...
##  $ diabetes            : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ sleep_apnea         : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ hypothyroidism      : int  1 0 1 0 0 0 0 0 0 0 ...
##  $ dyslipidemia        : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ antidepressant_use  : int  0 1 0 0 1 1 0 0 0 0 ...
##  $ prior_covid19       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ other_comorb        : int  0 1 1 0 1 0 1 1 1 0 ...

Recode variables

table1::label(data$age) <- "Age"
units(data$age) <- "years"

table1::label(data$sex) <- "Sex"
data$sex <- factor(data$sex, 
                   levels=c(1,2), 
                   labels=c("Woman", "Man")
                   )


table1::label(data$weight) <- "Weight"
units(data$weight) <- "kilograms (kg)"

table1::label(data$height) <- "Height"
units(data$height) <- "meters (m)"

table1::label(data$type_obesity) <- "Obesity class"
data$type_obesity <- factor(data$type_obesity, 
                            levels=c(1,2,3), 
                            labels=c("Class 1 Obesity",
                                     "Class 2 Obesity",
                                     "Class 3 Obesity")
                            ) 

table1::label(data$ARISCAT) <- "ARISCAT"
units(data$ARISCAT) <- "score"

table1::label(data$ARISCAT_group) <- "ARISCAT risk group"
data$ARISCAT_group <- factor(data$ARISCAT_group,
                             levels=c(1,2,3), 
                             labels=c("Low Risk",
                                      "Intermediate Risk",
                                      "High Risk")
                             )


table1::label(data$ASA) <- "ASA physical status"
data$ASA <- factor(data$ASA, 
                   levels=c(1,2,3), 
                   labels=c("ASA 1", "ASA 2", "ASA 3")
                   )

table1::label(data$spo2_VPO) <- "Oxygen saturation (SpO2)"
units(data$spo2_VPO) <- "%"

table1::label(data$surgical_procedure) <- "Surgical procedure"
data$surgical_procedure <- factor(data$surgical_procedure,
                                  levels=c(1,2,3,4),
                                  labels=c("SG", # 'SG' = Sleeve Gastrectomy
                                           "RYGB", # 'RYGB' = Roux-en-Y gastric bypass
                                           "OAGB", # 'OAGB' = One anastomosis gastric bypass
                                           "LBGS") # 'LBGS' = lap-band to sleeve
                                  )

table1::label(data$CORADS) <- "CO-RADS"
data$CORADS <- factor(data$CORADS,
                      levels=c(1,2,3,4),
                      labels=c("CO-RADS 1",
                               "CO-RADS 2",
                               "CO-RADS 3",
                               "CO-RADS 4"),
                      ordered = TRUE
                      )

table1::label(data$atelectasis) <- "Atelectasis"
data$atelectasis <- factor(data$atelectasis,
                           levels=c(0,1),
                           labels=c("No", "Yes")
                           ) %>% 
  relevel("Yes","No") #Relevel done for convenience when visualizing 2x2 tables.

table1::label(data$atelectasis_location) <- "Atelectasis location"
data$atelectasis_location <- factor(data$atelectasis_location,
                                    levels=c(1,2),
                                    labels=c("Unilateral", "Bilateral")
                                    )

table1::label(data$atelectasis_percent) <- "Percentage of atelectasis"
units(data$atelectasis_percent) <- "%"

table1::label(data$hb) <- "Hemoglobin"
units(data$hb) <- "g/dL"

table1::label(data$hct) <- "Hematocrit"
units(data$hct) <- "%"

table1::label(data$leu) <- "WBC count"
units(data$leu) <- "10³/µL"

table1::label(data$neu_percent) <- "Neutrophils (percent)"
units(data$neu_percent) <- "%"

table1::label(data$neu_absolute) <- "Neutrophils (absolute)"
units(data$neu_absolute) <- "10³/µL"

table1::label(data$linf_percent) <- "Lymphocytes (percent)"
units(data$linf_percent) <- "%"

table1::label(data$linf_absolute) <- "Lymphocytes (absolute)"
units(data$linf_absolute) <- "10³/µL"

table1::label(data$mon_percent) <- "Monocytes (percent)"
units(data$mon_percent) <- "%"

table1::label(data$mon_absolute) <- "Monocytes (absolute)"
units(data$mon_absolute) <- "10³/µL"

table1::label(data$platelets) <- "Platelets"
units(data$platelets) <- "cells/µL"

table1::label(data$glucose) <- "Glucose"
units(data$glucose) <- "mg/dL"

table1::label(data$urea) <- "Urea"
units(data$urea) <- "mg/dL"

table1::label(data$creatinine) <- "Creatinine"
units(data$creatinine) <- "mg/dL"

data$rapid_covid_test <- as.factor(data$rapid_covid_test) %>% 
  fct_recode(negative = "0") 

data$PCR_covid <- as.factor(data$PCR_covid) %>% fct_recode(negative = "0") 

table1::label(data$state_residence) <- "State of residence"
data$state_residence <- as.factor(data$state_residence)

data$altitude <- as.numeric(data$altitude)
table1::label(data$altitude) <- "Mean altitude"
units(data$altitude) <- "meters"

table1::label(data$surgery_performed) <- "Surgery performed"
data$surgery_performed <- factor(data$surgery_performed,
                                 levels=c(0,1),
                                 labels=c("No", "Yes")
                                 ) 

table1::label(data$hypertension) <- "Hypertension"
data$hypertension <- factor(data$hypertension, 
                            levels=c(0,1), 
                            labels=c("No", "Yes")
                            ) 

table1::label(data$diabetes) <- "Diabetes"
data$diabetes <- factor(data$diabetes, 
                        levels=c(0,1), 
                        labels=c("No", "Yes")
                        )

table1::label(data$sleep_apnea) <- "Obstructive sleep apnea"
data$sleep_apnea <- factor(data$sleep_apnea,
                           levels=c(0,1),
                           labels=c("No", "Yes")
                           ) 

table1::label(data$hypothyroidism) <- "Hypothyroidism"
data$hypothyroidism <- factor(data$hypothyroidism, 
                              levels=c(0,1),
                              labels=c("No", "Yes")
                              )

table1::label(data$dyslipidemia) <- "Dyslipidemia"
data$dyslipidemia <- factor(data$dyslipidemia,
                            levels=c(0,1),
                            labels=c("No", "Yes")
                            )  

table1::label(data$antidepressant_use) <- "Antidepressants use"
data$antidepressant_use <- factor(data$antidepressant_use,
                                  levels=c(0,1),
                                  labels=c("No", "Yes")
                                  )

table1::label(data$prior_covid19) <- "Prior COVID-19"
data$prior_covid19 <- factor(data$prior_covid19, 
                             levels=c(0,1), 
                             labels=c("No", "Yes")
                             )

table1::label(data$other_comorb) <- "Other comorbidities"
data$other_comorb <- factor(data$other_comorb, 
                            levels=c(0,1), 
                            labels=c("No", "Yes")
                            )

str(data)

General overview

summary(data)
##        ID             age           sex          weight          height     
##  Min.   :  1.0   Min.   :20.00   Woman:221   Min.   : 73.8   Min.   :1.480  
##  1st Qu.: 61.5   1st Qu.:33.00   Man  : 22   1st Qu.: 97.6   1st Qu.:1.620  
##  Median :122.0   Median :40.00               Median :111.2   Median :1.660  
##  Mean   :122.0   Mean   :40.37               Mean   :115.9   Mean   :1.670  
##  3rd Qu.:182.5   3rd Qu.:49.00               3rd Qu.:129.7   3rd Qu.:1.715  
##  Max.   :243.0   Max.   :65.00               Max.   :264.6   Max.   :1.910  
##                                                                             
##       BMI                 type_obesity    ARISCAT               ARISCAT_group
##  Min.   :30.00   Class 1 Obesity: 62   Min.   :23.0   Low Risk         :178  
##  1st Qu.:34.81   Class 2 Obesity: 55   1st Qu.:23.0   Intermediate Risk: 65  
##  Median :40.31   Class 3 Obesity:126   Median :23.0   High Risk        :  0  
##  Mean   :41.51                         Mean   :24.6                          
##  3rd Qu.:46.15                         3rd Qu.:26.0                          
##  Max.   :77.31                         Max.   :42.0                          
##                                                                              
##     ASA         spo2_VPO     surgical_procedure       CORADS    atelectasis
##  ASA 1: 52   Min.   :88.00   SG  :192           CO-RADS 1:233   Yes: 81    
##  ASA 2:149   1st Qu.:93.00   RYGB:  7           CO-RADS 2:  6   No :162    
##  ASA 3: 33   Median :96.00   OAGB:  5           CO-RADS 3:  2              
##  NA's :  9   Mean   :94.93   LBGS: 31           CO-RADS 4:  2              
##              3rd Qu.:97.00   NA's:  8                                      
##              Max.   :99.00                                                 
##                                                                            
##  atelectasis_location atelectasis_percent       hb             hct       
##  Unilateral: 55       Min.   : 0.000      Min.   : 9.90   Min.   :30.30  
##  Bilateral : 26       1st Qu.: 0.000      1st Qu.:13.90   1st Qu.:40.52  
##  NA's      :162       Median : 0.000      Median :14.45   Median :42.60  
##                       Mean   : 2.778      Mean   :14.52   Mean   :42.68  
##                       3rd Qu.: 5.000      3rd Qu.:15.20   3rd Qu.:44.67  
##                       Max.   :27.500      Max.   :18.50   Max.   :52.90  
##                                           NA's   :5       NA's   :5      
##       leu          neu_percent     neu_absolute     linf_percent  
##  Min.   : 3.100   Min.   :32.00   Min.   : 1.600   Min.   :14.00  
##  1st Qu.: 6.600   1st Qu.:59.00   1st Qu.: 3.969   1st Qu.:30.00  
##  Median : 7.700   Median :63.00   Median : 4.883   Median :35.00  
##  Mean   : 7.826   Mean   :62.94   Mean   : 4.956   Mean   :34.87  
##  3rd Qu.: 8.900   3rd Qu.:68.00   3rd Qu.: 5.894   3rd Qu.:39.00  
##  Max.   :13.300   Max.   :84.00   Max.   :10.773   Max.   :66.00  
##  NA's   :5        NA's   :5       NA's   :5        NA's   :5      
##  linf_absolute    mon_percent     mon_absolute     platelets    
##  Min.   :0.616   Min.   :0.000   Min.   :0.616   Min.   :154.0  
##  1st Qu.:2.244   1st Qu.:1.000   1st Qu.:2.244   1st Qu.:272.2  
##  Median :2.587   Median :2.000   Median :2.587   Median :316.0  
##  Mean   :2.703   Mean   :1.626   Mean   :2.703   Mean   :317.2  
##  3rd Qu.:3.095   3rd Qu.:2.000   3rd Qu.:3.095   3rd Qu.:354.8  
##  Max.   :5.676   Max.   :4.000   Max.   :5.676   Max.   :495.0  
##  NA's   :5       NA's   :5       NA's   :5       NA's   :5      
##     glucose            urea        creatinine     rapid_covid_test
##  Min.   : 59.00   Min.   :10.0   Min.   :0.5000   negative:243    
##  1st Qu.: 74.00   1st Qu.:17.0   1st Qu.:0.6600                   
##  Median : 83.00   Median :20.5   Median :0.7500                   
##  Mean   : 85.61   Mean   :21.4   Mean   :0.7577                   
##  3rd Qu.: 92.00   3rd Qu.:26.0   3rd Qu.:0.8400                   
##  Max.   :200.00   Max.   :49.0   Max.   :1.5900                   
##  NA's   :5        NA's   :5      NA's   :5                        
##     PCR_covid   surgery_performed   state_residence    altitude     
##  negative:  3   No : 10           Texas     :87     Min.   :  31.0  
##  NA's    :240   Yes:233           Washington:39     1st Qu.: 519.0  
##                                   Utah      :27     Median : 519.0  
##                                   Alberta   :22     Mean   : 650.1  
##                                   Florida   :20     3rd Qu.: 806.0  
##                                   California:16     Max.   :1861.0  
##                                   (Other)   :32                     
##  hypertension diabetes  sleep_apnea hypothyroidism dyslipidemia
##  No :179      No :218   No :224     No :219        No :224     
##  Yes: 64      Yes: 25   Yes: 19     Yes: 24        Yes: 19     
##                                                                
##                                                                
##                                                                
##                                                                
##                                                                
##  antidepressant_use prior_covid19 other_comorb
##  No :146            No :239       No :143     
##  Yes: 97            Yes:  4       Yes:100     
##                                               
##                                               
##                                               
##                                               
## 
Exclude participants with CO-RADS ≥ 3:

Participants with higher probability of having a current diagnosis of COVID-19 are expected to have chest CT alterations due to COVID-19 pneumonia. Thus, will be excluded.

Number of patients according to CO-RADS:

summary(data$CORADS)
## CO-RADS 1 CO-RADS 2 CO-RADS 3 CO-RADS 4 
##       233         6         2         2
count(data)
##     n
## 1 243
data <- data %>% filter(as.numeric(CORADS) < 3) %>% droplevels(data$CORADS)
count(data)
##     n
## 1 239
Exclude participants with prior COVID-19:

Since prior COVID-19 is considered a confounder and since there are only 3 participants with prior COVID-19 which would provide difficult to assess the role of prior COVID-19 in analyses, participants with prior COVID-19 were excluded from the analysis.

count(data)
##     n
## 1 239
data <- data %>% filter(prior_covid19 == "No") %>% droplevels(data$prior_covid19)
count(data)
##     n
## 1 236

Will remove prior_covid19 column as it no longer provides information of a varying characteristic. Similarly, rapid_covid_test does not provide additional information.

length(data)
## [1] 42
data <- data %>% select(-c(prior_covid19, rapid_covid_test))
length(data)
## [1] 40

Missing data per variable

overview_na(data)

Missing data for PCR_covid and is explained since only patients who decided to have a test performed on their own will reported the result. The medical center did not require a negative PCR test at that time during the pandemic, reason why PCR tests were not systematically performed. As shown in earlier summary of variables, all available tests (n=3) were negative. This variable will not be analyzed further downstream:

data <- data %>% select(-PCR_covid)

The variable atelectasis_location has missing data since those patients who did not have atelectasis naturally do not have a location recorded. Will assess if data are missing for those who had atelectasis:

data %>% filter(atelectasis == "Yes") %>% 
  group_by(atelectasis_location) %>% 
  overview_na()

There is no missing data for atelectasis_location after sub-setting only those who had atelectasis.

Lastly, I will subset all participants without the prior variable to further assess the extent of missing data for other variables:

data %>% select(-c(atelectasis_location)) %>% overview_na()

Missing data constitutes less than 5% for all remaining variables. Thus, will proceed with complete-case analysis without performing any data imputation procedure.

State of residence of participants.

table <- data %>% 
  group_by(state_residence) %>% 
  summarize(n=n()) %>% 
  mutate(name=state_residence) %>% 
  select(-state_residence) %>% 
  relocate(name) %>% 
  arrange(desc(n))

table
## # A tibble: 10 × 2
##    name           n
##    <fct>      <int>
##  1 Texas         81
##  2 Washington    39
##  3 Utah          27
##  4 Alberta       21
##  5 Florida       20
##  6 California    16
##  7 Louisiana     16
##  8 Arkansas       7
##  9 Missouri       5
## 10 Nevada         4

The following two chunks of code were adapted from contribution by cpsievert.

# us laea projection as used in albersusa::us_laea_proj
us_laea <- "+proj=laea +lat_0=45 +lon_0=-100 +x_0=0 +y_0=0 +a=6370997 +b=6370997 +units=m +no_defs"

usa <- ne_states(country = "united states of america", returnclass = "sf") %>%
  st_transform(us_laea) %>%
  select(name)

canada <- ne_states(country = "canada", returnclass = "sf") %>%
  st_transform(us_laea) %>%
  select(name)

usa <- left_join(usa, table, by="name") 
canada <- left_join(canada, table, by="name") 

Supplementary Figure S2

Create map with number of patients per state in Canada and the USA.

figS2 <- plot_ly(split = ~name, color = ~n, colors = "Blues",
                 stroke = I("black"), span = I(1),
                 height = 450,
                 width = 800) %>%
  add_sf(data = usa, color = I("dimgray")) %>%
  add_sf(data = canada, color = I("dimgray")) %>%
  add_sf(data = filter(canada, name %in% table$name)) %>%
  add_sf(data = filter(usa, name %in% table$name)) %>%
  layout(showlegend = FALSE) %>%
  colorbar(title = "Number of<br>patients")

Create a table with the absolute frequency of patients per state to complement visualization.

table <- table %>% rename(State=name)

fig_add <- plot_ly(type="table",
                   domain = list(x=c(0,0.25), y=c(0,0.6)),
                   header=list(values=names(table)), 
                   cells=list(values=unname(table)))

Combine the prior two elements in one single figure:

fig <- subplot(fig_add, figS2,
               nrows=1,
               margin=1) 
fig

Distribution of numerical variables

Due to the large number of figures produced, I will not show these in the rendered HTML document. Plots were created with this code:

for(variable in names(data)) {
if(is.numeric(data[[variable]])) {
        
  name <- variable
        
  Main <- paste("Histogram of", variable, sep=" ")
  hist(data[[variable]], xlab=name, main=Main, 
       col = "steelblue")
        
  Main <- paste("Boxplot of", variable, sep=" ")
  boxplot(data[[variable]], xlab=name, main=Main, 
          horizontal=TRUE, 
          col = "steelblue")
        
  Main <- paste("Normal Q-Q Plot of", variable, sep=" ")
  qqnorm(data[[variable]], main = Main); 
  qqline(data[[variable]], col = "steelblue",lwd = 2)
  
  }
}

Near normal distribution:
- Age: light tails
- height: heavy right tail, 4 outliers right
- hb: heavy tails, bilateral outliers
- hct: heavy tails, bilateral outliers
- leu: near normal, bilateral outliers
- neu_absolute: heavy right tail, two right outliers
- linf_absolute: heavy right tail, bilateral outliers (more right)
- mon_absolute: heavy right tail, bilateral outliers (more right)
- platelets: two right outliers
- urea: four right outliers
- creatinine: three right outliers

Distribution not normal:
- Weight: right-skewed, outliers are verified observations of extreme weight. - BMI: right-skewed, outliers are verified observations of extreme BMI.
- spo2_VPO: Left-skewed
- neu_percent: left-skewed
- linf_percent: right-skewed
- glucose: right-skewed
- mon_percent: observations around only 5 data points. Will not use this variable, only absolute monocytes will be used.
- altitude: distribution not clear as values are quite apart an concentrate around single states with diferring mean altitudes. Perhaps best to try to use smooth term later?

Outcome variable:
- atelectasis_percent: Zero-inflated. Would be difficult to manage as categorical ordinal due to low number of patients in some categories. Will re-assess later and decide.

Group variables by distribution, then apply statement to use output to render table 1:

normal <- c("age",
            "height",
            "hb",
            "hct",
            "leu",
            "neu_absolute",
            "linf_absolute",
            "mon_absolute",
            "platelets",
            "urea",
            "creatinine"
            )
nonormal <- c("weight"
              ,"BMI",
              "spo2_VPO",
              "neu_percent",
              "linf_percent",
              "glucose",
              "atelectasis_percent",
              "mon_percent",
              "altitude"
              )

for(name in normal) cat(name,"=", switch(EXPR = name, '"Mean (SD)",'),"\n")
## age = "Mean (SD)", 
## height = "Mean (SD)", 
## hb = "Mean (SD)", 
## hct = "Mean (SD)", 
## leu = "Mean (SD)", 
## neu_absolute = "Mean (SD)", 
## linf_absolute = "Mean (SD)", 
## mon_absolute = "Mean (SD)", 
## platelets = "Mean (SD)", 
## urea = "Mean (SD)", 
## creatinine = "Mean (SD)",
for(name in nonormal) cat(name,"=", switch(EXPR = name, '"Median [Q1, Q3]",'),"\n")
## weight = "Median [Q1, Q3]", 
## BMI = "Median [Q1, Q3]", 
## spo2_VPO = "Median [Q1, Q3]", 
## neu_percent = "Median [Q1, Q3]", 
## linf_percent = "Median [Q1, Q3]", 
## glucose = "Median [Q1, Q3]", 
## atelectasis_percent = "Median [Q1, Q3]", 
## mon_percent = "Median [Q1, Q3]", 
## altitude = "Median [Q1, Q3]",

Table 1

Log abbreviations

abbreviations = sort(c("body-mass index (BMI)", 
                       "peripheral saturation of oxygen (SpO2)",
                       "COVID-19 Reporting and Data System (CO-RADS)",
                       "sleeve gastrectomy (SG)",
                       "roux-en-Y gastric bypass (RYGB)",
                       "one anastomosis gastric bypass (OAGB)",
                       "lap-band to gastric sleeve (LBGS)",
                       "coronavirus disease (COVID-19)",
                       "Assess Respiratory Risk in Surgical Patients in Catalonia (ARISCAT)",
                       "white blood cell (WBC)")
                     )
abbreviations_stats = sort(c("standard deviation (SD)",
                             "interquartile range (IQR)",
                             "percentage (%)",
                             "25th percentile (Q1)",
                             "75th percentile (Q3)")
                           )

Generate Table 1 with render function

rndr <- function(x, name, ...) {
    if (!is.numeric(x)) return(render.categorical.default(x))
    what <- switch(name,
                   age = "Mean (SD)", 
                   height = "Mean (SD)", 
                   hb = "Mean (SD)", 
                   hct = "Mean (SD)", 
                   leu = "Mean (SD)", 
                   neu_absolute = "Mean (SD)", 
                   linf_absolute = "Mean (SD)", 
                   mon_absolute = "Mean (SD)", 
                   platelets = "Mean (SD)", 
                   urea = "Mean (SD)", 
                   creatinine = "Mean (SD)", 
                   weight = "Median [Q1, Q3]", 
                   BMI = "Median [Q1, Q3]", 
                   spo2_VPO = "Median [Q1, Q3]", 
                   neu_percent = "Median [Q1, Q3]", 
                   linf_percent = "Median [Q1, Q3]", 
                   glucose = "Median [Q1, Q3]", 
                   atelectasis_percent = "Median [Q1, Q3]", 
                   mon_percent = "Median [Q1, Q3]", 
                   altitude = "Median [Q1, Q3]"
                   ) 
    parse.abbrev.render.code(c("", what))(x)
}

table1 <- table1(~ sex + age + weight + height + BMI + surgical_procedure + ARISCAT_group + spo2_VPO + altitude + hypertension + diabetes + sleep_apnea + hypothyroidism + dyslipidemia + antidepressant_use + CORADS + glucose + creatinine + urea + hb + hct + leu + neu_absolute + linf_absolute + mon_absolute + platelets | type_obesity, 
       data=data,
       overall=c(left="Total"), 
       render=rndr,
       caption="Clinical characteristics of patients, according to obesity class.", 
       footnote=c(abbreviations,abbreviations_stats)
       )
table1
Clinical characteristics of patients, according to obesity class.
Total
(N=236)
Class 1 Obesity
(N=62)
Class 2 Obesity
(N=53)
Class 3 Obesity
(N=121)

Assess Respiratory Risk in Surgical Patients in Catalonia (ARISCAT)

body-mass index (BMI)

coronavirus disease (COVID-19)

COVID-19 Reporting and Data System (CO-RADS)

lap-band to gastric sleeve (LBGS)

one anastomosis gastric bypass (OAGB)

peripheral saturation of oxygen (SpO2)

roux-en-Y gastric bypass (RYGB)

sleeve gastrectomy (SG)

white blood cell (WBC)

25th percentile (Q1)

75th percentile (Q3)

interquartile range (IQR)

percentage (%)

standard deviation (SD)

sex
Woman 214 (90.7%) 60 (96.8%) 48 (90.6%) 106 (87.6%)
Man 22 (9.3%) 2 (3.2%) 5 (9.4%) 15 (12.4%)
Age (years)
Mean (SD) 40.3 (9.87) 42.1 (10.3) 40.8 (9.25) 39.1 (9.82)
Weight (kilograms (kg))
Median [Q1, Q3] 111 [97.4, 130] 88.8 [84.2, 95.7] 107 [102, 112] 128 [114, 142]
Height (meters (m))
Mean (SD) 1.67 (0.0822) 1.66 (0.0631) 1.69 (0.0856) 1.67 (0.0889)
BMI
Median [Q1, Q3] 40.3 [34.6, 46.0] 33.0 [31.5, 33.8] 38.3 [36.6, 39.1] 45.6 [42.2, 51.1]
surgical_procedure
SG 189 (80.1%) 52 (83.9%) 41 (77.4%) 96 (79.3%)
RYGB 6 (2.5%) 1 (1.6%) 1 (1.9%) 4 (3.3%)
OAGB 5 (2.1%) 1 (1.6%) 1 (1.9%) 3 (2.5%)
LBGS 31 (13.1%) 5 (8.1%) 9 (17.0%) 17 (14.0%)
ARISCAT_group
Low Risk 175 (74.2%) 44 (71.0%) 41 (77.4%) 90 (74.4%)
Intermediate Risk 61 (25.8%) 18 (29.0%) 12 (22.6%) 31 (25.6%)
Oxygen saturation (SpO2) (%)
Median [Q1, Q3] 96.0 [93.0, 97.0] 97.0 [95.0, 97.8] 96.0 [94.0, 97.0] 94.0 [92.0, 97.0]
Mean altitude (meters)
Median [Q1, Q3] 519 [519, 806] 519 [313, 806] 519 [519, 885] 519 [519, 806]
hypertension
No 177 (75.0%) 52 (83.9%) 40 (75.5%) 85 (70.2%)
Yes 59 (25.0%) 10 (16.1%) 13 (24.5%) 36 (29.8%)
diabetes
No 211 (89.4%) 58 (93.5%) 48 (90.6%) 105 (86.8%)
Yes 25 (10.6%) 4 (6.5%) 5 (9.4%) 16 (13.2%)
sleep_apnea
No 218 (92.4%) 60 (96.8%) 50 (94.3%) 108 (89.3%)
Yes 18 (7.6%) 2 (3.2%) 3 (5.7%) 13 (10.7%)
hypothyroidism
No 213 (90.3%) 55 (88.7%) 50 (94.3%) 108 (89.3%)
Yes 23 (9.7%) 7 (11.3%) 3 (5.7%) 13 (10.7%)
dyslipidemia
No 218 (92.4%) 58 (93.5%) 48 (90.6%) 112 (92.6%)
Yes 18 (7.6%) 4 (6.5%) 5 (9.4%) 9 (7.4%)
antidepressant_use
No 142 (60.2%) 36 (58.1%) 33 (62.3%) 73 (60.3%)
Yes 94 (39.8%) 26 (41.9%) 20 (37.7%) 48 (39.7%)
CORADS
CO-RADS 1 230 (97.5%) 61 (98.4%) 51 (96.2%) 118 (97.5%)
CO-RADS 2 6 (2.5%) 1 (1.6%) 2 (3.8%) 3 (2.5%)
Glucose (mg/dL)
Median [Q1, Q3] 83.0 [74.0, 92.0] 83.0 [77.0, 90.0] 81.0 [70.0, 92.0] 83.0 [74.0, 92.0]
Creatinine (mg/dL)
Mean (SD) 0.758 (0.146) 0.773 (0.115) 0.744 (0.144) 0.757 (0.160)
Urea (mg/dL)
Mean (SD) 21.4 (6.70) 22.9 (6.08) 20.5 (6.77) 21.1 (6.89)
Hemoglobin (g/dL)
Mean (SD) 14.5 (1.21) 14.5 (1.20) 14.5 (1.17) 14.6 (1.24)
Hematocrit (%)
Mean (SD) 42.8 (3.33) 42.6 (3.32) 42.6 (3.22) 42.9 (3.41)
WBC count (10³/µL)
Mean (SD) 7.83 (1.76) 7.81 (1.74) 7.71 (1.76) 7.89 (1.78)
Neutrophils (absolute) (10³/µL)
Mean (SD) 4.97 (1.42) 4.94 (1.39) 4.83 (1.39) 5.04 (1.46)
Lymphocytes (absolute) (10³/µL)
Mean (SD) 2.70 (0.811) 2.71 (0.802) 2.70 (0.920) 2.69 (0.771)
Monocytes (absolute) (10³/µL)
Mean (SD) 2.70 (0.811) 2.71 (0.802) 2.70 (0.920) 2.69 (0.771)
Platelets (cells/µL)
Mean (SD) 316 (64.4) 307 (67.6) 319 (63.2) 320 (63.2)

NOTE: The ASA physical status variable has not been included in analyses since the updated version of ASA consulted on October 2023 classifies obesity (30<BMI<40) as ASA 2 and obesity (BMI ≥40) as ASA 3. The distribution of frequencies of ASA~obesity class in this dataset does not match such definition. This occurred since an outdated version of ASA that did not include obesity was likely used by clinicians when writing the preoperative assessment medical note:

table(data$ASA,data$type_obesity)
##        
##         Class 1 Obesity Class 2 Obesity Class 3 Obesity
##   ASA 1              31              18               3
##   ASA 2              29              34              85
##   ASA 3               0               0              32

Save Table 1:

t1flex(table1) %>% save_as_html(path = paste(tabfolder,"/Table1.html",sep=""))

properties <- prop_section(
  page_size = page_size(orient = "landscape",
                        width = 8.3, height = 11.7),
  type = "continuous",
  page_margins = page_mar()
  )

t1flex(table1) %>% 
  save_as_docx(path = paste(tabfolder,"/Table1.docx",sep=""),
               pr_section = properties
               )

Assessment of independent variables

The selection of variables that will be assessed is according to the following directed acyclic graph which will be used again before statistical modelling, to assess conditional independencies.

DAG

DAG <- dagitty( 'dag {
bb="-3.687,-2.637,5.96,2.575"
age [adjusted,pos="-0.929,-0.900"]
altitude_cat [pos="4.259,0.627"]
atelectasis_percent [outcome,pos="-0.258,-2.137"]
hb [pos="-0.789,2.123"]
sex [adjusted,pos="-2.995,-0.744"]
sleep_apnea [adjusted,pos="0.108,0.783"]
spo2_VPO [outcome,pos="2.079,0.078"]
type_obesity [exposure,pos="-2.883,0.520"]
age -> atelectasis_percent
age -> hb
age -> sleep_apnea
age -> spo2_VPO
age -> type_obesity
altitude_cat -> atelectasis_percent
altitude_cat -> hb
altitude_cat -> sleep_apnea
altitude_cat -> spo2_VPO
atelectasis_percent -> hb
atelectasis_percent -> spo2_VPO
hb -> spo2_VPO
sex -> atelectasis_percent
sex -> hb
sex -> sleep_apnea
sex -> spo2_VPO
sex -> type_obesity
sleep_apnea -> atelectasis_percent
sleep_apnea -> spo2_VPO
sleep_apnea -> type_obesity
type_obesity -> atelectasis_percent
type_obesity -> spo2_VPO
}
' )

plot(DAG)

Other variables that are potential confounders are not shown in this DAG since they were addressed by design in this study as follows:
- Current COVID-19: Exclusion criteria were applied to n=2 patients with CO-RADS 3 and n=2 with CO-RADS 4. Only participants with low probability of COVID-19 (CO-RADS 1 and 2) were included in this study.
- Prior COVID-19: This was an exclusion criterion (n=3).
- Bronchiectasis: This was an exclusion criterion (n=0).

Description of independent variables

Age

summary(age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20.00   32.75   40.00   40.26   48.25   65.00

Distribution of age was assessed earlier.

The mean age was 40.3 (SD: 9.87).

Sex

tab <- table(sex)
tab
## sex
## Woman   Man 
##   214    22
prop <- round(prop.table(tab)*100,1)
prop
## sex
## Woman   Man 
##  90.7   9.3

Most patients in the sample were woman (n=214, 90.7%).

Body mass index (BMI)

summary(BMI)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   30.00   34.63   40.30   41.37   46.02   77.31
tab <- table(type_obesity)
tab
## type_obesity
## Class 1 Obesity Class 2 Obesity Class 3 Obesity 
##              62              53             121
prop <- round(prop.table(tab)*100,1)
prop
## type_obesity
## Class 1 Obesity Class 2 Obesity Class 3 Obesity 
##            26.3            22.5            51.3

Distribution of BMI was assessed earlier. It is right-skewed due to extreme values (verified outliers). The WHO classification of BMI for obesity class will be used to complement descriptions and for potential use later during statistical modelling.

The median BMI was 40.295 (IQR: 34.63- 46.02). The distribution of BMI was right-skewed due to extreme BMI values (range: 30- 77.31). Most patients were in the class 3 obesity category (n=121, 51.3%), followed by class 1 (n=62, 26.3%) and 2 (n=53, 22.5%).

SpO2

summary(spo2_VPO)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      88      93      96      95      97      99

Distribution of SpO2 during the pre-anesthetic assessment was presented earlier. It is left-skewed due to some participants exhibiting decreased SpO2. Will categorize according to clinical categories to assess the proportion of patients with decreased SpO2.

Proportion of patients with decreased SpO2

First, create SpO2 breaks:

data <- data %>% 
  mutate(spo2_cat = cut(spo2_VPO,
                        breaks=c(87,90,94,100),
                        right=TRUE,
                        labels=c("≤90","90 to 94",">94")
                        )
         )
tab <- table(spo2_cat)
tab
## spo2_cat
##      ≤90 90 to 94      >94 
##       15       75      146
prop <- round(prop.table(tab)*100,1)
prop
## spo2_cat
##      ≤90 90 to 94      >94 
##      6.4     31.8     61.9

The median SpO2 during the pre-anethetic assessment was 96 (IQR: 93-97) %, with a minimum value of 88%. Of these, n=146 (61.9%) had normal SpO2 (above 94%), whereas n=75 (31.8%) had a value in the 90-94% range, and n=15 (6.4%) had ≤90%.

Obstructive sleep apnea

tab <- table(sleep_apnea)
tab
## sleep_apnea
##  No Yes 
## 218  18
prop <- round(prop.table(tab)*100,1)
prop
## sleep_apnea
##   No  Yes 
## 92.4  7.6

Patients with a diagnosis of OSA were 7.6% (n=18) of the sample.

Altitude

summary(altitude)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    31.0   519.0   519.0   652.7   806.0  1861.0

Distribution of altitude was assessed earlier. Cannot assume normal distribution. Thus, I will create a new variable categorizing values according to the study by Crocker ME, et al.

data <- data %>% 
  mutate(altitude_cat = cut(altitude,
                            breaks=c(0,1000,2500),
                            right=FALSE,
                            labels=c("Low altitude","Moderate altitude")
                            )
         )
tab <- table(altitude_cat)
tab
## altitude_cat
##      Low altitude Moderate altitude 
##               205                31
round(prop.table(tab)*100,1)
## altitude_cat
##      Low altitude Moderate altitude 
##              86.9              13.1

Hemoglobin

summary(hb)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    9.90   13.90   14.50   14.54   15.20   18.50       2

Distribution of hemoglobin was assessed earlier. Two participants don’t have a hemoglobin value.

Relationships between independent variables

Characteristics of participants according to BMI class are shown in Table 1.

BMI and SpO2

Scatterplot

plot(spo2_VPO~BMI, 
     main="Scatterplot", 
     xlab="Body mass index (kg/m²)", 
     ylab="SpO2 (%)"
     )

Relationship does not seem to be linear (also, variables were not normally distributed, with outliers), but suggests a negative correlation. Will assess if a smooth BMI term explains SpO2 better, and if so, what is the best number of knots to model this relationship:

GAM model with k=2:

model<-gam(spo2_VPO~s(BMI,k=2))
AIC_k2 <- AIC(model)
summary(model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## spo2_VPO ~ s(BMI, k = 2)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  94.9958     0.1436   661.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value    
## s(BMI) 1.923  1.994 61.61  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.349   Deviance explained = 35.4%
## GCV = 4.9287  Scale est. = 4.8677    n = 236
k.check(model)
##        k'      edf  k-index p-value
## s(BMI)  2 1.923156 1.035767   0.665
plot(model)

GAM model with k=4:

model<-gam(spo2_VPO~s(BMI,k=4))
AIC_k4 <- AIC(model)
summary(model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## spo2_VPO ~ s(BMI, k = 4)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   94.996      0.141   673.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value    
## s(BMI) 2.835   2.98 47.86  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.372   Deviance explained =   38%
## GCV = 4.7712  Scale est. = 4.6937    n = 236
k.check(model)
##        k'      edf  k-index p-value
## s(BMI)  3 2.834552 1.074122   0.845
plot(model)

GAM model with k=6:

model<-gam(spo2_VPO~s(BMI,k=6))
AIC_k6 <- AIC(model)
summary(model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## spo2_VPO ~ s(BMI, k = 6)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  94.9958     0.1396   680.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value    
## s(BMI) 4.149  4.678 32.02  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.385   Deviance explained = 39.6%
## GCV = 4.7022  Scale est. = 4.5996    n = 236
k.check(model)
##        k'      edf  k-index p-value
## s(BMI)  5 4.148805 1.102576  0.9425
plot(model)

GAM model with k=8:

model<-gam(spo2_VPO~s(BMI,k=8))
AIC_k8 <- AIC(model)
summary(model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## spo2_VPO ~ s(BMI, k = 8)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  94.9958     0.1394   681.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value    
## s(BMI) 4.723  5.629 26.62  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.387   Deviance explained = 39.9%
## GCV = 4.7001  Scale est. = 4.5862    n = 236
k.check(model)
##        k'      edf  k-index p-value
## s(BMI)  7 4.723071 1.108757   0.975
plot(model)

GAM model with k=12:

model<-gam(spo2_VPO~s(BMI,k=12))
AIC_k12 <- AIC(model)
summary(model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## spo2_VPO ~ s(BMI, k = 12)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  94.9958     0.1395   680.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value    
## s(BMI) 4.616  5.685 26.16  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.385   Deviance explained = 39.8%
## GCV = 4.7067  Scale est. = 4.5947    n = 236
k.check(model)
##        k'      edf  k-index p-value
## s(BMI) 11 4.615804 1.106134   0.955
plot(model)

Best AIC:

list(AIC_k2,AIC_k4,AIC_k6,AIC_k8,AIC_k12)
## [[1]]
## [1] 1048.14
## 
## [[2]]
## [1] 1040.448
## 
## [[3]]
## [1] 1036.959
## 
## [[4]]
## [1] 1036.83
## 
## [[5]]
## [1] 1037.165

All models are significantly better than linear. Thus, using a smooth term for BMI is better than modelling a linear relationship.

Regarding AIC, the models with k>6 are not better at explaining the variance. Will try a model with k=5 since the best model is expected to be anywhere between k=4 and k=6:

GAM model with k=5 (4 degrees of freedom):

model<-gam(spo2_VPO~s(BMI,k=5))
AIC_k5 <- AIC(model)
summary(model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## spo2_VPO ~ s(BMI, k = 5)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  94.9958     0.1399   679.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value    
## s(BMI) 3.685  3.941 37.81  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.382   Deviance explained = 39.2%
## GCV = 4.7121  Scale est. = 4.6185    n = 236
k.check(model)
##        k'      edf k-index p-value
## s(BMI)  4 3.685135 1.09624  0.9325
plot(model)

Best AIC:

list(AIC_k4,AIC_k5,AIC_k6)
## [[1]]
## [1] 1040.448
## 
## [[2]]
## [1] 1037.475
## 
## [[3]]
## [1] 1036.959

Model with k=5 still offers and advantage compared to k=4 (drop in AIC). No other improvements in k-index or visual representation are achieved with higher k. Thus, will use k=5 to model.

fig1b <- ggplot(data, aes(BMI,spo2_VPO)) + 
  geom_point(size=0.6,color="gray60") +
  geom_smooth(method="gam",formula = y ~ s(x, bs = "cs", k = 5), color="cadetblue4")  + 
  labs(y="SpO2 (%)",x="Body mass index (kg/m²)",tag="B") + 
  ylim(83,100) + xlim(30,80) +
  theme_bw() + 
  theme(panel.border = element_blank(), 
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.text.x = element_text(size=rel(1.2)), 
        axis.text.y = element_text(size=rel(1.2))
        )
fig1b

Negative non-monotonic relationship since SpO2 decreases, but then seems to increase slightly again at BMI 40, followed by a marked decrease as BMI decreases at values higher than ~42.

Spearman’s correlation coefficient shouldn’t be used due to relationship not being monotonically decreasing. However, I will calculate it just to have a rough idea (but will not report this in the paper).

spearman <- cor.test(spo2_VPO,BMI,method="spearman",exact=FALSE)
spearman
## 
##  Spearman's rank correlation rho
## 
## data:  spo2_VPO and BMI
## S = 3105183, p-value = 2.28e-11
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## -0.417458
Figure 1b
ggsave("Figure1b.png",plot=fig1b,
       path=figfolder, 
       width = 8,  height = 6, units = "in", dpi = 300
       )

BMI exhibited a negative non-linear monotonic relationship with SpO2 (Figure 1B, rho= -0.417, p<0.001).

BMI and age

Scatterplot

plot(age~BMI, 
     main="Scatterplot", 
     ylab="Age (years)", 
     xlab="Body mass index (BMI)"
     )

Datapoints scattered. Relationship probably linear, but there are influential true outliers with extreme BMI. Will assess with Spearman correlation analysis due to extreme BMI values.

spearman <- cor.test(age,BMI,method="spearman",exact=FALSE)
spearman
## 
##  Spearman's rank correlation rho
## 
## data:  age and BMI
## S = 2530759, p-value = 0.017
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.1552445

Age had a weak negative correlation with BMI (rho= -0.155, p=0.017).

BMI and sex

data_BMI <- data %>% group_by(sex) 

median_bmi <- data_BMI %>% 
  summarize(n = n(), 
            BMI_median = median(BMI), 
            Q1 = quantile(BMI,0.25), 
            Q3 = quantile(BMI,0.75), 
            min = min(BMI), 
            max = max(BMI)
            )
median_bmi
## # A tibble: 2 × 7
##   sex       n BMI_median    Q1    Q3   min   max
##   <fct> <int>      <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Woman   214       39.8  34.5  45.5  30    74.9
## 2 Man      22       43.0  37.9  46.2  30.8  77.3
boxplot(BMI ~ sex)

Distribution not normal and influential outliers. Will assess non-parametrically.

wil <- wilcox.test(BMI ~ sex, data = data_BMI, exact = FALSE)
wil
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  BMI by sex
## W = 1918.5, p-value = 0.1537
## alternative hypothesis: true location shift is not equal to 0

The median BMI was not different between men (43, IQR: 37.9-46.2) and women (39.9, IQR: 34.5-45.5) (p=0.154).

BMI and sleep apnea

boxplot(BMI ~ sleep_apnea)

Distribution not normal and influential outliers. Will assess non-parametrically.

data_BMI <- data %>% group_by(sleep_apnea) 

median_bmi <- data_BMI %>% 
  summarize(n = n(),
            BMI_median = median(BMI),
            Q1 = quantile(BMI,0.25),
            Q3 = quantile(BMI,0.75),
            min = min(BMI),
            max = max(BMI)
            )
median_bmi
## # A tibble: 2 × 7
##   sleep_apnea     n BMI_median    Q1    Q3   min   max
##   <fct>       <int>      <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 No            218       39.8  34.5  45.1  30    74.9
## 2 Yes            18       44.0  40.1  49.2  32.4  77.3
wil <- wilcox.test(BMI ~ sleep_apnea, data = data_BMI, exact = FALSE)
wil
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  BMI by sleep_apnea
## W = 1274, p-value = 0.01353
## alternative hypothesis: true location shift is not equal to 0

The median BMI was significantly higher in participants with sleep apnea (44, IQR: 40.1-49.2) compared to those without OSA (39.8, IQR: 34.5-45.1) (p=0.014).

Age and SpO2

Scatterplot

plot(spo2_VPO~age, 
     main="Scatterplot",
     xlab="Age",
     ylab="SpO2 (%)"
     )

Do not seem to be correlated. Will apply Spearman’s correlation test:

spearman <- cor.test(spo2_VPO,age,method="spearman",exact=FALSE)
spearman
## 
##  Spearman's rank correlation rho
## 
## data:  spo2_VPO and age
## S = 2143192, p-value = 0.7405
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## 0.02167287

Age and SpO2 were not correlated (rho= 0.022, p=0.74).

Age and sex

boxplot(age~sex)

Assess distribution of age by sex:

data_age <- data %>% group_by(sex)

ggplot(data_age,aes(x = age)) +
  geom_histogram(fill = "firebrick3", colour = "black") +
  facet_grid(sex ~ .)

qqPlot(age ~ sex, data=data_age)

Distribution near-normal, but light tails for women. However, t-test could be robust to deviations from normality and differences in group size. Will assess mean and variance for further testing:

mean_age <- data_age %>% 
  summarise(n=n(),
            age_mean = mean(age),
            sd = sd(age),
            variance = var(age)
            )
mean_age
## # A tibble: 2 × 5
##   sex       n age_mean    sd variance
##   <fct> <int>    <dbl> <dbl>    <dbl>
## 1 Woman   214     40.2  9.94     98.9
## 2 Man      22     40.6  9.28     86.1

Variances are similar. However, group sizes differ my 10x. Welch’s t-test more suitable:

t_test <- t.test(age ~ sex, data = data_age)
t_test
## 
##  Welch Two Sample t-test
## 
## data:  age by sex
## t = -0.19917, df = 26.213, p-value = 0.8437
## alternative hypothesis: true difference in means between group Woman and group Man is not equal to 0
## 95 percent confidence interval:
##  -4.715913  3.882438
## sample estimates:
## mean in group Woman   mean in group Man 
##            40.21963            40.63636

Mean age was similar bethween men (40.6, sd:9.3) and women (40.2, sd:9.9) (p=0.844).

Age and sleep apnea

data_age <- data %>% group_by(sleep_apnea)

ggplot(data_age, aes(x = age, fill=sleep_apnea)) +
  geom_histogram(position = "identity", alpha = 0.4)

qqPlot(age ~ sleep_apnea)

Distribution near-normal. Will assess mean and variance for further testing.

mean_age <- data_age %>% 
  summarise(n=n(), 
            age_mean = mean(age), 
            sd = sd(age), 
            variance = var(age)
            )
mean_age
## # A tibble: 2 × 5
##   sleep_apnea     n age_mean    sd variance
##   <fct>       <int>    <dbl> <dbl>    <dbl>
## 1 No            218     40.2 10.0     100. 
## 2 Yes            18     41.4  8.19     67.1

Size per group very different, variances do not look similar. Welch’s t-test more suitable:

t_test <- t.test(age ~ sleep_apnea, data = data_age)
t_test
## 
##  Welch Two Sample t-test
## 
## data:  age by sleep_apnea
## t = -0.59817, df = 21.42, p-value = 0.556
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  -5.473186  3.025683
## sample estimates:
##  mean in group No mean in group Yes 
##          40.16514          41.38889

Age was not significantly different between participants with OSA (41.4, sd:8.2) and those without (40.2, sd:10) (p=0.556).

SpO2 and sex

boxplot(spo2_VPO ~ sex)

data_spo2 <- data %>% group_by(sex)

ggplot(data_spo2, aes(x = spo2_VPO, fill=sex)) +
  geom_histogram(position = "identity", alpha = 0.4)

qqPlot(spo2_VPO ~ sex)

Distribution deviates from normal and small group size for men. Will assess non-parametrically.

median_spo2 <- data_spo2 %>% 
  summarize(n = n(),
            spo2_median = median(spo2_VPO), 
            Q1 = quantile(spo2_VPO,0.25), 
            Q3 = quantile(spo2_VPO,0.75), 
            min = min(spo2_VPO), 
            max = max(spo2_VPO)
            )
median_spo2
## # A tibble: 2 × 7
##   sex       n spo2_median    Q1    Q3   min   max
##   <fct> <int>       <dbl> <dbl> <dbl> <int> <int>
## 1 Woman   214        96      93  97      88    99
## 2 Man      22        94.5    92  97.8    88    98
wil <- wilcox.test(spo2_VPO ~ sex, data = data_spo2, exact = FALSE)
wil
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  spo2_VPO by sex
## W = 2602, p-value = 0.413
## alternative hypothesis: true location shift is not equal to 0

The median SpO2 was not different between men (94.5, IQR: 92-97.8) and women (96, IQR: 93-97) (p=0.413).

SpO2 and sleep apnea

data_spo2 <- data %>% group_by(sleep_apnea)

ggplot(data_spo2, aes(x = spo2_VPO, fill=sleep_apnea)) +
  geom_histogram(position = "identity", alpha = 0.4)

qqPlot(spo2_VPO ~ sleep_apnea)

boxplot(spo2_VPO ~ sleep_apnea)

Distribution not normal, and smaller group size for those with sleep apnea. Will assess non-parametrically.

median_spo2 <- data_spo2 %>% 
  summarize(n = n(), 
            spo2_median = median(spo2_VPO), 
            Q1 = quantile(spo2_VPO,0.25), 
            Q3 = quantile(spo2_VPO,0.75), 
            min = min(spo2_VPO), 
            max = max(spo2_VPO)
            )
median_spo2
## # A tibble: 2 × 7
##   sleep_apnea     n spo2_median    Q1    Q3   min   max
##   <fct>       <int>       <dbl> <dbl> <dbl> <int> <int>
## 1 No            218          96    93    97    88    99
## 2 Yes            18          92    91    93    88    94
wil <- wilcox.test(spo2_VPO ~ sleep_apnea, data = data_spo2, exact = FALSE)
wil
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  spo2_VPO by sleep_apnea
## W = 3350, p-value = 4.973e-07
## alternative hypothesis: true location shift is not equal to 0

Patients with sleep apnea had a lower median SpO2 (92, IQR: 91-93) than those without OSA (96, IQR: 93-97) (p<0.001).

SpO2 and altitude

Scatterplot

plot(spo2_VPO~altitude, data=data, 
     main="Scatterplot", 
     xlab="Mean altitude (meters)", 
     ylab="SpO2 (%)"
     )
abline(lm(spo2_VPO~altitude),col="steelblue")

There does not seem to be a pattern.

Would a smooth term be useful to model altitude?

ggplot(data, aes(altitude,spo2_VPO)) + 
  geom_point(size=0.6,color="gray40") + 
  geom_smooth(method="loess", color="cadetblue4") +
  ylab("SpO2 (%)") + xlab("Mean altitude (meters)") + 
  ylim(85,100) + xlim(0,2000) +
  theme_bw() + 
  theme(panel.border = element_blank(), panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.text.x = element_text(size=rel(1.2)), 
        axis.text.y = element_text(size=rel(1.2))
        )

It is likely that a smooth term for SpO2 would be non-informative since there is no clear reasonable pattern in this smooth plot. Additionally, it is well known that any impacts in SpO2 due to altitudes up to 2000 are very limited (i.e 1 to 2 units). REF.

I will still check if a smooth term may be better than linear in case that adjustment for this variable is needed.

GAM model with k=2:

model<-gam(spo2_VPO~s(altitude,k=2))
summary(model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## spo2_VPO ~ s(altitude, k = 2)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  94.9958     0.1779   533.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##               edf Ref.df     F p-value
## s(altitude) 1.542   1.79 0.519    0.58
## 
## R-sq.(adj) =  0.000785   Deviance explained = 0.734%
## GCV = 7.5521  Scale est. = 7.4707    n = 236
k.check(model)
##             k'      edf  k-index p-value
## s(altitude)  2 1.542225 1.018301    0.61
plot(model)

model<-gam(spo2_VPO~s(altitude,k=4))
summary(model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## spo2_VPO ~ s(altitude, k = 4)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   94.996      0.178   533.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##               edf Ref.df     F p-value
## s(altitude) 1.505  1.798 0.437   0.631
## 
## R-sq.(adj) =  0.000124   Deviance explained = 0.653%
## GCV = 7.5559  Scale est. = 7.4757    n = 236
k.check(model)
##             k'      edf  k-index p-value
## s(altitude)  3 1.504893 1.017622  0.5925
plot(model)

Smooth term is not significantly better than one assuming linearity. Furthermore, the relationship with SpO2 in smooth term does not make any sense (i.e., according to prior reference, SpO2 should decrease at higher altitudes). Thus, it would be very likely that including this term would only explain noise in any case, not the true known causal relationship between SpO2 and altitude.

Lastly, will check the pattern according to altitude categories, which may be a better term to use in models in any case.

boxplot(spo2_VPO ~ altitude_cat)

data_spo2 <- data %>% group_by(altitude_cat)

ggplot(data_spo2, aes(x = spo2_VPO, fill=altitude_cat)) +
  geom_histogram(position = "identity", alpha = 0.4)

qqPlot(spo2_VPO ~ altitude_cat)

Distribution deviates from normal and small group size for the moderate altitude group. Will assess non-parametrically.

median_spo2 <- data_spo2 %>% 
  summarize(n = n(),
            spo2_median = median(spo2_VPO),
            Q1 = quantile(spo2_VPO,0.25), 
            Q3 = quantile(spo2_VPO,0.75), 
            min = min(spo2_VPO), 
            max = max(spo2_VPO)
            )
median_spo2
## # A tibble: 2 × 7
##   altitude_cat          n spo2_median    Q1    Q3   min   max
##   <fct>             <int>       <int> <dbl> <dbl> <int> <int>
## 1 Low altitude        205          96    93    97    88    99
## 2 Moderate altitude    31          96    92    97    88    99
wil <- wilcox.test(spo2_VPO ~ altitude_cat, data = data_spo2, exact = FALSE)
wil
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  spo2_VPO by altitude_cat
## W = 3360, p-value = 0.6043
## alternative hypothesis: true location shift is not equal to 0

The median SpO2 was not different between low and moderate altitude categories (p=0.604).

SpO2 and hemoglobin

summary(hb)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    9.90   13.90   14.50   14.54   15.20   18.50       2

Scatterplot

plot(spo2_VPO~hb, data=data, 
     main="Scatterplot", 
     xlab="Hemoglobin (g/dL)", 
     ylab="SpO2 (%)"
     )
abline(lm(spo2_VPO~altitude),col="red")

There does not seem to be a clear pattern.

Would a smooth term be useful to model SpO2?

ggplot(data, aes(hb,spo2_VPO)) + 
  geom_point(size=0.6,color="gray40") + 
  geom_smooth(method="loess", color="red") +
  ylab("SpO2 (%)") + xlab("Hemoglobin (g/dL)") + 
  ylim(85,100) +
  theme_bw() + 
  theme(panel.border = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.line = element_line(colour = "black"),
        axis.text.x = element_text(size=rel(1.2)), 
        axis.text.y = element_text(size=rel(1.2))
        )

Hemoglobin likely has an effect on SpO2 at lower hemoglobin values, which makes sense with what is observed in the graph. Assuming a linear relationship could lead to incorrect conclusions according to this. Nonetheless, it looks like the apparent non-linear relationship at low Hb values is due to only 2 observations with wide confidence intervals showing that the true slope could go either up, straight or down, so it may also be incorrect to assume a non-linear relationship based only on this plot. I will model to see if there is an optimal smooth term for hemoglobin or if a linear term best fits the data:

GAM model with k=2:

model<-gam(spo2_VPO~s(hb,k=2))
summary(model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## spo2_VPO ~ s(hb, k = 2)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  94.9829     0.1789   530.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##       edf Ref.df     F p-value
## s(hb)   1      1 0.272   0.603
## 
## R-sq.(adj) =  -0.00314   Deviance explained = 0.117%
## GCV = 7.5555  Scale est. = 7.4909    n = 234
k.check(model)
##       k' edf   k-index p-value
## s(hb)  2   1 0.9302737    0.13
plot(model)

GAM model with k=4:

model<-gam(spo2_VPO~s(hb,k=4))
summary(model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## spo2_VPO ~ s(hb, k = 4)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  94.9829     0.1789   530.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##       edf Ref.df     F p-value
## s(hb)   1      1 0.272   0.603
## 
## R-sq.(adj) =  -0.00314   Deviance explained = 0.117%
## GCV = 7.5555  Scale est. = 7.4909    n = 234
k.check(model)
##       k' edf   k-index p-value
## s(hb)  3   1 0.9302737  0.1375
plot(model)

The estimated degrees of freedom (edf) in both cases were 1, plus p=0.6, meaning that a linear term is better fitted to this data than a non-linear term.

spearman <- cor.test(spo2_VPO,hb,method="spearman",exact=FALSE)
spearman
## 
##  Spearman's rank correlation rho
## 
## data:  spo2_VPO and hb
## S = 2274841, p-value = 0.3201
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##         rho 
## -0.06527711

SpO2 and hemoglobin were not correlated (rho= -0.065, p=0.32).

Sex and sleep apnea

Mean expected frequency:

mean_exp <- data %>% 
  drop_na(sex,sleep_apnea) %>%
  summarize(mean_expected_freq = n()/(nlevels(sex)*nlevels(sleep_apnea)))

mean_exp
##   mean_expected_freq
## 1                 59

Since value is grater than 5.0, chi-squared without continuity correction is appropriate.

freq <- table(sex, sleep_apnea)
freq
##        sleep_apnea
## sex      No Yes
##   Woman 204  10
##   Man    14   8
round(prop.table(freq),4)
##        sleep_apnea
## sex         No    Yes
##   Woman 0.8644 0.0424
##   Man   0.0593 0.0339

Mosaic Plot

ggplot(data = data) +
  geom_mosaic(aes(x = product(sex,sleep_apnea), fill=sex),na.rm = TRUE) +
  scale_fill_manual(values=c("peachpuff","sandybrown")) +
  theme_mosaic() 

chi <- chisq.test(freq, correct=FALSE)
chi
## 
##  Pearson's Chi-squared test
## 
## data:  freq
## X-squared = 28.437, df = 1, p-value = 9.68e-08

Percentage by level (women, men):

round(prop.table(freq,1),4)*100
##        sleep_apnea
## sex        No   Yes
##   Woman 95.33  4.67
##   Man   63.64 36.36

Sex was associated with OSA (p<0.001) as men had the diagnosis more frequently compared to women.

Outcome variable

Corroborate that atelectasis(Yes/No) matches atelectasis percent equal or different to 0%:

table(atelectasis,atelectasis_percent)
##            atelectasis_percent
## atelectasis   0 2.5   5 7.5  10 12.5  15 17.5 27.5
##         Yes   0  11  14  33   6    1   4    7    1
##         No  159   0   0   0   0    0   0    0    0

Yes, these do match.

Prevalence of atelectasis

freq <- table(atelectasis)
percent <- round((prop.table(freq)*100),2)
total <- rbind(freq,percent)
total
##           Yes     No
## freq    77.00 159.00
## percent 32.63  67.37

Prevalence of atelectasis with 95% confidence interval

prev_atelec <- prop.test(freq, correct=FALSE)
confint <- round((prev_atelec$conf.int*100),2)
prev_atelec
## 
##  1-sample proportions test without continuity correction
## 
## data:  freq, null probability 0.5
## X-squared = 28.492, df = 1, p-value = 9.411e-08
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.2696526 0.3884549
## sample estimates:
##         p 
## 0.3262712

The prevalence of atelectasis was 32.63 (95%CI: 26.97, 38.85).

Atelectasis - obesity class

Mean expected frequency:

mean_exp <- data %>% 
  drop_na(type_obesity, atelectasis) %>% 
  summarize(mean_expected_freq = n()/(nlevels(type_obesity)*nlevels(atelectasis)))

mean_exp
##   mean_expected_freq
## 1           39.33333
freq <- table(type_obesity, atelectasis)
freq
##                  atelectasis
## type_obesity      Yes No
##   Class 1 Obesity   8 54
##   Class 2 Obesity  15 38
##   Class 3 Obesity  54 67
round(prop.table(freq),4)
##                  atelectasis
## type_obesity         Yes     No
##   Class 1 Obesity 0.0339 0.2288
##   Class 2 Obesity 0.0636 0.1610
##   Class 3 Obesity 0.2288 0.2839

Mosaic Plot

data %>% 
  mutate(atelectasis = fct_relevel(atelectasis, "No", "Yes")) %>%
ggplot() +
  geom_mosaic(aes(x = product(atelectasis,type_obesity), fill=atelectasis),na.rm = TRUE) +
  scale_fill_manual(values=c("grey95","lightsteelblue4")) +
  theme_mosaic() + 
  theme(axis.text.x=element_text(size=rel(0.8)))

chi <- chisq.test(freq, correct=FALSE)
chi
## 
##  Pearson's Chi-squared test
## 
## data:  freq
## X-squared = 19.352, df = 2, p-value = 6.279e-05

Atelectasis location by obesity class

Mean expected frequency:

mean_exp <- data %>% 
  drop_na(type_obesity, atelectasis_location) %>% 
  summarize(mean_expected_freq = n()/(nlevels(type_obesity)*nlevels(atelectasis_location)))

mean_exp
##   mean_expected_freq
## 1           12.83333

Mean expected frequency is greater than 5.0, so chi-squared without continuity correction is adequate.

freq <- table(type_obesity, atelectasis_location)
freq
##                  atelectasis_location
## type_obesity      Unilateral Bilateral
##   Class 1 Obesity          7         1
##   Class 2 Obesity         10         5
##   Class 3 Obesity         36        18
round(prop.table(freq),4)
##                  atelectasis_location
## type_obesity      Unilateral Bilateral
##   Class 1 Obesity     0.0909    0.0130
##   Class 2 Obesity     0.1299    0.0649
##   Class 3 Obesity     0.4675    0.2338

Mosaic Plot

ggplot(data = data) +
  geom_mosaic(aes(x = product(type_obesity,atelectasis_location),
                  fill=type_obesity),na.rm = TRUE) +
  scale_fill_manual(values=c("seagreen1","seagreen3","seagreen4")) +
  theme_mosaic() 

chi <- chisq.test(freq, correct=FALSE)
chi
## 
##  Pearson's Chi-squared test
## 
## data:  freq
## X-squared = 1.4503, df = 2, p-value = 0.4843

Prevalence of atelectasis with 95% confidence intervals

#This is the prevalence of atelectasis for the total sample: 
atelectasis_total <- data %>% group_by(atelectasis) %>% 
  summarise(n = n()) %>%  
  mutate(prev = n / sum(n)*100, 
         lower = lapply(n, prop.test, n = sum(n)), 
         upper = sapply(lower, function(x) x$conf.int[2])*100, 
         lower = sapply(lower, function(x) x$conf.int[1])*100,
         type_obesity = "Total") %>% 
  mutate_at(3:5, round, 2)
atelectasis_total$confint <- paste(atelectasis_total$lower, "-", atelectasis_total$upper)
atelectasis_total <- atelectasis_total %>% select(-c(lower,upper))

#This is the prevalence of atelectasis by obesity class: 
atelectasis_obesity <- data %>% group_by(type_obesity, atelectasis) %>% 
  summarise(n = n()) %>%  
  mutate(prev = n / sum(n)*100, 
         lower = lapply(n, prop.test, n = sum(n)), 
         upper = sapply(lower, function(x) x$conf.int[2])*100, 
         lower = sapply(lower, function(x) x$conf.int[1])*100) %>% 
  mutate_at(4:6, round, 2)
atelectasis_obesity$confint <- paste(atelectasis_obesity$lower, "-", atelectasis_obesity$upper)
atelectasis_obesity <- atelectasis_obesity %>% select(-c(lower,upper))

#This is just to combine the two prior tables:   
atelectasis <- atelectasis_total %>% 
  bind_rows(atelectasis_obesity) %>% 
  relocate(type_obesity, .before = n)

atelectasis
## # A tibble: 8 × 5
##   atelectasis type_obesity        n  prev confint      
##   <fct>       <chr>           <int> <dbl> <chr>        
## 1 Yes         Total              77  32.6 26.77 - 39.06
## 2 No          Total             159  67.4 60.94 - 73.23
## 3 Yes         Class 1 Obesity     8  12.9 6.13 - 24.4  
## 4 No          Class 1 Obesity    54  87.1 75.6 - 93.87 
## 5 Yes         Class 2 Obesity    15  28.3 17.2 - 42.56 
## 6 No          Class 2 Obesity    38  71.7 57.44 - 82.8 
## 7 Yes         Class 3 Obesity    54  44.6 35.68 - 53.92
## 8 No          Class 3 Obesity    67  55.4 46.08 - 64.32

Location of atelectasis (Unilateral/Bilateral)

#This is the location of atelectasis for the total sample:   
atelectasis_total_location <- data %>% 
  filter(atelectasis=="Yes") %>% 
  group_by(atelectasis_location) %>% 
  summarise(Freq = n()) %>%  
  mutate(Percentage = (round((Freq/sum(Freq)*100),digits=2)), 
         type_obesity = "Total"
         )
atelectasis_total_location$sumpercent <- paste(atelectasis_total_location$Freq," (", atelectasis_total_location$Percentage, "%)")
atelectasis_total_location <- atelectasis_total_location %>% 
  select(-c(Freq,Percentage)) %>% 
  pivot_wider(names_from = atelectasis_location,values_from = sumpercent)

#This is the location by obesity class:  
atelectasis_obesity_location <- data %>% 
  filter(atelectasis=="Yes") %>% 
  group_by(type_obesity, atelectasis_location) %>% 
  summarise(Freq = n()) %>%  
  mutate(Percentage = (round((Freq/sum(Freq)*100),digits=2)))
atelectasis_obesity_location$sumpercent <- paste(atelectasis_obesity_location$Freq," (", atelectasis_obesity_location$Percentage, "%)")
atelectasis_obesity_location <- atelectasis_obesity_location %>% select(-c(Freq,Percentage)) %>% pivot_wider(names_from = atelectasis_location,values_from = sumpercent)

# This is to bind both of the location results in a single table
location <- atelectasis_total_location %>%
  bind_rows(atelectasis_obesity_location)

location
## # A tibble: 4 × 3
##   type_obesity    Unilateral     Bilateral     
##   <chr>           <chr>          <chr>         
## 1 Total           53  ( 68.83 %) 24  ( 31.17 %)
## 2 Class 1 Obesity 7  ( 87.5 %)   1  ( 12.5 %)  
## 3 Class 2 Obesity 10  ( 66.67 %) 5  ( 33.33 %) 
## 4 Class 3 Obesity 36  ( 66.67 %) 18  ( 33.33 %)

The prevalence of atelectasis was greater in higher obesity classes: class 1, n=8 (12.9%, 95%CI:6.13 - 24.4); class 2, n=15 (28.3%, 95%CI:17.2 - 42.56); and class 3, n=54 (44.63%, 95%CI:35.68 - 53.92) (p<0.001).

Of those who had atelectasis, the most frequent presentation was unilateral n=53 ( 68.83 %), compared to bilateral n=24 ( 31.17 %). When examining this by obesity class, the observed distribution was not significantly different for those with class 1, 2, and 3 obesity categories (n=7 ( 87.5 %), n=10 ( 66.67 %), and n=36 ( 66.67 %), respectively) (p=0.484).

Atelectasis Percent

Assess distribution if assumed to be a numeric variable:

data %>% 
  mutate(atelectasis_percent = as.numeric(atelectasis_percent)) %>%
  ggplot(aes(x = atelectasis_percent)) +
  geom_histogram(colour = "black") +
  facet_grid(type_obesity ~ .)

Distribution excluding 0:

data %>% 
  mutate(atelectasis_percent = as.numeric(atelectasis_percent)) %>% 
  filter(atelectasis_percent >0) %>% 
  ggplot(aes(x = atelectasis_percent)) +
  geom_histogram(colour = "black") +
  facet_grid(type_obesity ~ .)

Mean atelectasis percentage

The following would be the mean atelectasis percentage coverage if a normal distribution were assumed, which is what has been done in some prior studies:

data %>%  summarize(
    mean = mean(atelectasis_percent),
    sd = sd(atelectasis_percent)
  ) 
##       mean       sd
## 1 2.658898 4.687145

And by obesity class:

data %>% group_by(type_obesity) %>%
  summarize(
    mean = mean(atelectasis_percent),
    sd = sd(atelectasis_percent)
  ) 
## # A tibble: 3 × 3
##   type_obesity     mean    sd
##   <fct>           <dbl> <dbl>
## 1 Class 1 Obesity 0.927  2.91
## 2 Class 2 Obesity 1.56   3.15
## 3 Class 3 Obesity 4.03   5.52

As is evident from these numbers, assuming normality causes standard deviation to capture negative values, which is impossible in reality for this variable.

Thus, bootstrapping the mean and 95%CI is expected to lead to more appropriate estimates:

boot_atel <- one.boot(data$atelectasis_percent, mean, R=10000)
mean_boot <- mean(boot_atel$t)
mean_boot
## [1] 2.66296
boot_ci <- boot.ci(boot_atel)
boot_ci
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 10000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_atel)
## 
## Intervals : 
## Level      Normal              Basic         
## 95%   ( 2.062,  3.247 )   ( 2.044,  3.231 )  
## 
## Level     Percentile            BCa          
## 95%   ( 2.087,  3.273 )   ( 2.087,  3.273 )  
## Calculations and Intervals on Original Scale

The bias-corrected and accelerated (BCa) bootstrap interval is known to lead to more stable intervals with better coverage. Will report this. However, it is a good thing that here 95%CI through different methods do not lead to widely different results.

Now, I will calculate this for different BMI categories:

data_class_1 <- data %>% filter(type_obesity=="Class 1 Obesity") 
data_class_2 <- data %>% filter(type_obesity=="Class 2 Obesity") 
data_class_3 <- data %>% filter(type_obesity=="Class 3 Obesity") 

Class 1:

boot_class1 <- one.boot(data_class_1$atelectasis_percent, mean, R=10000)
mean_boot_class1 <- mean(boot_class1$t)
mean_boot_class1
## [1] 0.9250605
boot_ci_class1 <- boot.ci(boot_class1)
boot_ci_class1
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 10000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_class1)
## 
## Intervals : 
## Level      Normal              Basic         
## 95%   ( 0.2140,  1.6456 )   ( 0.1210,  1.5323 )  
## 
## Level     Percentile            BCa          
## 95%   ( 0.3226,  1.7339 )   ( 0.3226,  1.7339 )  
## Calculations and Intervals on Original Scale

Class 2:

boot_class2 <- one.boot(data_class_2$atelectasis_percent, mean, R=10000)
mean_boot_class2 <- mean(boot_class2$t)
mean_boot_class2
## [1] 1.553594
boot_ci_class2 <- boot.ci(boot_class2)
boot_ci_class2
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 10000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_class2)
## 
## Intervals : 
## Level      Normal              Basic         
## 95%   ( 0.732,  2.388 )   ( 0.708,  2.311 )  
## 
## Level     Percentile            BCa          
## 95%   ( 0.802,  2.406 )   ( 0.802,  2.406 )  
## Calculations and Intervals on Original Scale

Class 3:

boot_class3 <- one.boot(data_class_3$atelectasis_percent, mean, R=10000)
mean_boot_class3 <- mean(boot_class3$t)
mean_boot_class3
## [1] 4.036407
boot_ci_class3 <- boot.ci(boot_class3)
boot_ci_class3
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 10000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_class3)
## 
## Intervals : 
## Level      Normal              Basic         
## 95%   ( 3.036,  5.007 )   ( 3.017,  4.959 )  
## 
## Level     Percentile            BCa          
## 95%   ( 3.099,  5.041 )   ( 3.099,  5.041 )  
## Calculations and Intervals on Original Scale

The mean atelectasis percentage coverage in the sample was 2.66% (95%CI:2.09-3.27) and according to obesity categories: class 1 (0.93%, 95%CI:0.32-1.73), class 2 (1.55%, 95%CI:0.8-2.41), and class 3 (4.04%, 95%CI:3.1-5.04).

Atelectasis percentage by obesity class

Now, I will continue assessing atelectasis percentage if assumed to be categorical ordinal:

Mean expected frequency:

mean_exp <- data %>% mutate(atelectasis_percent=factor(atelectasis_percent)) %>% summarize(mean_expected_freq = n()/(nlevels(type_obesity)*nlevels(atelectasis_percent)))
mean_exp
##   mean_expected_freq
## 1           8.740741

Mean expected frequency is greater than 5.0, so chi-squared without continuity correction is adequate.

tab <- table(atelectasis_percent,type_obesity)
tab
##                    type_obesity
## atelectasis_percent Class 1 Obesity Class 2 Obesity Class 3 Obesity
##                0                 54              38              67
##                2.5                2               7               2
##                5                  1               2              11
##                7.5                4               4              25
##                10                 0               1               5
##                12.5               0               0               1
##                15                 0               1               3
##                17.5               1               0               6
##                27.5               0               0               1
prop.table(tab)
##                    type_obesity
## atelectasis_percent Class 1 Obesity Class 2 Obesity Class 3 Obesity
##                0        0.228813559     0.161016949     0.283898305
##                2.5      0.008474576     0.029661017     0.008474576
##                5        0.004237288     0.008474576     0.046610169
##                7.5      0.016949153     0.016949153     0.105932203
##                10       0.000000000     0.004237288     0.021186441
##                12.5     0.000000000     0.000000000     0.004237288
##                15       0.000000000     0.004237288     0.012711864
##                17.5     0.004237288     0.000000000     0.025423729
##                27.5     0.000000000     0.000000000     0.004237288
prop_fig2a <- prop.table(tab,margin=2)
prop_fig2a
##                    type_obesity
## atelectasis_percent Class 1 Obesity Class 2 Obesity Class 3 Obesity
##                0        0.870967742     0.716981132     0.553719008
##                2.5      0.032258065     0.132075472     0.016528926
##                5        0.016129032     0.037735849     0.090909091
##                7.5      0.064516129     0.075471698     0.206611570
##                10       0.000000000     0.018867925     0.041322314
##                12.5     0.000000000     0.000000000     0.008264463
##                15       0.000000000     0.018867925     0.024793388
##                17.5     0.016129032     0.000000000     0.049586777
##                27.5     0.000000000     0.000000000     0.008264463
barplot(tab,beside=TRUE)

chi <- chisq.test(tab, correct=FALSE)
chi
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 39.434, df = 16, p-value = 0.0009412
Figure 2a
barplot(prop_fig2a,beside=TRUE,ylim=c(0,1),ylab="Relative frequency",
        col=brewer.pal(9,"Blues"),
        legend.text=c("0%","2.5%","5%","7.5%","10%","12.5%","15%","17.5%","27.5%"),
        space = c(0.2, 1.5)
        )

Figure2a <- recordPlot()
png(filename=paste(figfolder,"/Figure2a.png",sep=""),width=8, height=5, units="in", res=300)
Figure2a
dev.off
Smooth term?

Scatterplot

plot(atelectasis_percent~BMI, data=data, main="Scatterplot", xlab="Body mass index (kg/m²)", ylab="Atelectasis percent (%)")
abline(lm(atelectasis_percent~BMI),col="darkblue")

Atelectasis percent seems to increase as BMI increases. However, relationship is not linear.

Would a smooth term be more useful to model SpO2?

ggplot(data, aes(BMI,atelectasis_percent)) + 
  geom_point(size=0.6,color="gray40") + 
  geom_smooth(method="loess", color="darkblue") +
  ylab("Atelectasis percent (%)") + xlab("Body mass index (kg/m²)")  +
  theme_bw() + 
  theme(panel.border = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.text.x = element_text(size=rel(1.2)),
        axis.text.y = element_text(size=rel(1.2))
        )

What is the pattern when excluding participants with 0% atelectasis?

data %>% filter (atelectasis_percent>0) %>% 
  ggplot(aes(BMI,atelectasis_percent)) +
  geom_point(size=0.6,color="gray40") + 
  geom_smooth(method="loess", color="darkblue") +
  ylab("Atelectasis percent (%)") + xlab("Body mass index (kg/m²)")  +
  theme_bw() + 
  theme(panel.border = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.text.x = element_text(size=rel(1.2)), 
        axis.text.y = element_text(size=rel(1.2))
        )

GAM model with k=2:

model<-gam(atelectasis_percent~s(BMI,k=2))
AIC_k2 <- AIC(model)
summary(model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## atelectasis_percent ~ s(BMI, k = 2)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.6589     0.2248   11.83   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value    
## s(BMI) 1.963  1.999 97.75  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.457   Deviance explained = 46.2%
## GCV = 12.078  Scale est. = 11.926    n = 236
k.check(model)
##        k'      edf   k-index p-value
## s(BMI)  2 1.963441 0.9317886    0.14
plot(model)

GAM model with k=4:

model<-gam(atelectasis_percent~s(BMI,k=4))
AIC_k4 <- AIC(model)
summary(model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## atelectasis_percent ~ s(BMI, k = 4)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.6589     0.2141   12.42   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value    
## s(BMI) 2.937  2.997 81.98  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.508   Deviance explained = 51.4%
## GCV = 11.001  Scale est. = 10.817    n = 236
k.check(model)
##        k'      edf  k-index p-value
## s(BMI)  3 2.937474 1.038686  0.6825
plot(model)

GAM model with k=6:

model<-gam(atelectasis_percent~s(BMI,k=6))
AIC_k6 <- AIC(model)
summary(model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## atelectasis_percent ~ s(BMI, k = 6)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    2.659      0.211    12.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value    
## s(BMI) 4.572   4.91 52.93  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.522   Deviance explained = 53.1%
## GCV =  10.76  Scale est. = 10.506    n = 236
k.check(model)
##        k'      edf  k-index p-value
## s(BMI)  5 4.572447 1.077343  0.8925
plot(model)

GAM model with k=8:

model<-gam(atelectasis_percent~s(BMI,k=8))
AIC_k8 <- AIC(model)
summary(model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## atelectasis_percent ~ s(BMI, k = 8)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.6589     0.2115   12.57   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value    
## s(BMI) 4.764   5.67 44.97  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.519   Deviance explained = 52.9%
## GCV = 10.821  Scale est. = 10.557    n = 236
k.check(model)
##        k'      edf  k-index p-value
## s(BMI)  7 4.764266 1.073694  0.8725
plot(model)

GAM model with k=10:

model<-gam(atelectasis_percent~s(BMI,k=10))
AIC_k10 <- AIC(model)
summary(model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## atelectasis_percent ~ s(BMI, k = 10)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.6589     0.2114   12.58   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##         edf Ref.df     F p-value    
## s(BMI) 4.92  5.982 42.62  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.52   Deviance explained =   53%
## GCV = 10.822  Scale est. = 10.55     n = 236
k.check(model)
##        k'      edf  k-index p-value
## s(BMI)  9 4.920424 1.075019    0.87
plot(model)

Best AIC:

list(AIC_k2,AIC_k4,AIC_k6,AIC_k8,AIC_k10)
## [[1]]
## [1] 1259.668
## 
## [[2]]
## [1] 1237.596
## 
## [[3]]
## [1] 1232.312
## 
## [[4]]
## [1] 1233.637
## 
## [[5]]
## [1] 1233.634

All models are significantly better than linear. Thus, using a smooth term for BMI to predict atelectasis percent is better than modelling a linear relationship.

Regarding AIC, greatest improvement in AIC is k=6. Will model with k=5 and k=7 to compare

GAM model with k=5:

model<-gam(atelectasis_percent~s(BMI,k=5))
AIC_k5 <- AIC(model)
summary(model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## atelectasis_percent ~ s(BMI, k = 5)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.6589     0.2129   12.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value    
## s(BMI) 3.712  3.951 63.63  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.513   Deviance explained = 52.1%
## GCV = 10.914  Scale est. = 10.696    n = 236
k.check(model)
##        k'      edf  k-index p-value
## s(BMI)  4 3.712457 1.059789  0.7925
plot(model)

GAM model with k=7:

model<-gam(atelectasis_percent~s(BMI,k=7))
AIC_k7 <- AIC(model)
summary(model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## atelectasis_percent ~ s(BMI, k = 7)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.6589     0.2113   12.58   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value    
## s(BMI) 4.747  5.454 47.01  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.52   Deviance explained =   53%
## GCV = 10.801  Scale est. = 10.538    n = 236
k.check(model)
##        k'      edf  k-index p-value
## s(BMI)  6 4.746815 1.075589    0.91
plot(model)

Best AIC:

list(AIC_k5,AIC_k6,AIC_k7)
## [[1]]
## [1] 1235.685
## 
## [[2]]
## [1] 1232.312
## 
## [[3]]
## [1] 1233.19

k=6 offers the lowest AIC. Will keep k=6 to model.

fig1a <- ggplot(data, aes(BMI,atelectasis_percent)) + 
  geom_point(size=0.6,color="gray60") +
  geom_smooth(method="gam",formula = y ~ s(x, bs = "cs", k = 6), color="darkblue") + 
  labs(y="Atelectasis percent (%)",
       x="Body mass index (kg/m²)",
       tag="A") +
  xlim(30,80) +
  theme_bw() + 
  theme(panel.border = element_blank(), 
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.text.x = element_text(size=rel(1.2)), 
        axis.text.y = element_text(size=rel(1.2))
        )

fig1a

Negative monotonic relationship since SpO2 decreases as BMI increases.

Will assess Spearman’s correlation again only to have a rough idea (will not report this in the paper):

spearman <- cor.test(spo2_VPO,atelectasis_percent,method="spearman",exact=FALSE)
spearman
## 
##  Spearman's rank correlation rho
## 
## data:  spo2_VPO and atelectasis_percent
## S = 3883525, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.7727568

Atelectasis percent exhibited a negative non-linear monotonic relationship with SpO2 (Figure 1A, rho= -0.773, p<0.001).

Interestingly, this figure is almost a mirror image of the priorly created plot for SpO2 ~ BMI.

Figure 1a
ggsave("Figure1a.png",plot=fig1a,
       path=figfolder, 
       width = 8,  height = 6, units = "in", 
       dpi = 300
       )

Atelectasis - age

boxplot(age~atelectasis)

Assess distribution of age by atelectasis (yes/no):

data_age <- data %>% group_by(atelectasis)

ggplot(data_age,aes(x = age)) +
  geom_histogram(fill = "lightsteelblue4", colour = "black") +
  facet_grid(atelectasis ~ .)

qqPlot(age ~ atelectasis, data=data_age)

Distribution near-normal, will assess mean and variance for further testing:

mean_age <- data_age %>% 
  summarise(n=n(),
            age_mean = mean(age),
            sd = sd(age),
            variance = var(age)
            )
mean_age
## # A tibble: 2 × 5
##   atelectasis     n age_mean    sd variance
##   <fct>       <int>    <dbl> <dbl>    <dbl>
## 1 Yes            77     39.6  9.30     86.6
## 2 No            159     40.6 10.1     103.

Variances near-similar, but group sizes differ. Welch’s t-test more suitable:

t_test <- t.test(age ~ atelectasis, data = data_age)
t_test
## 
##  Welch Two Sample t-test
## 
## data:  age by atelectasis
## t = -0.67931, df = 162.72, p-value = 0.4979
## alternative hypothesis: true difference in means between group Yes and group No is not equal to 0
## 95 percent confidence interval:
##  -3.532230  1.724013
## sample estimates:
## mean in group Yes  mean in group No 
##          39.64935          40.55346

Age was similarly distributed among patients without atelectasis (40.6, sd:10.1) and those with atelectasis (39.6, sd:9.3) (p=0.498).

Atelectasis - sex

Mean expected frequency:

mean_exp <- data %>% drop_na(sex, atelectasis) %>% 
  summarize(mean_expected_freq = n()/(nlevels(sex)*nlevels(atelectasis)))

mean_exp
##   mean_expected_freq
## 1                 59

Mean expected frequency is greater than 5.0, so chi-squared without continuity correction is adequate.

freq <- table(sex, atelectasis)
freq
##        atelectasis
## sex     Yes  No
##   Woman  67 147
##   Man    10  12
round(prop.table(freq),4)
##        atelectasis
## sex        Yes     No
##   Woman 0.2839 0.6229
##   Man   0.0424 0.0508
percent <- round((prop.table(freq, 1)*100),2)
percent
##        atelectasis
## sex       Yes    No
##   Woman 31.31 68.69
##   Man   45.45 54.55

Mosaic Plot

data %>% 
  mutate(atelectasis = fct_relevel(atelectasis, "No", "Yes")) %>%
ggplot() +
  geom_mosaic(aes(x = product(atelectasis,sex), fill=atelectasis),na.rm = TRUE) +
  scale_fill_manual(values=c("grey95","lightsteelblue4")) +
  theme_mosaic() + 
  theme(axis.text.x=element_text(size=rel(0.8)))

chi <- chisq.test(freq, correct=FALSE)
chi
## 
##  Pearson's Chi-squared test
## 
## data:  freq
## X-squared = 1.8161, df = 1, p-value = 0.1778

There were no significant differences in atelectasis ocurrence between men (45.5%) and women (31.3%) (p=0.178).

Atelectasis - OSA

Mean expected frequency:

mean_exp <- data %>% 
  drop_na(sleep_apnea, atelectasis) %>% 
  summarize(mean_expected_freq = n()/(nlevels(sleep_apnea)*nlevels(atelectasis)))

mean_exp
##   mean_expected_freq
## 1                 59

Mean expected frequency is greater than 5.0, so chi-squared without continuity correction is adequate.

freq <- table(sleep_apnea, atelectasis)
freq
##            atelectasis
## sleep_apnea Yes  No
##         No   60 158
##         Yes  17   1
round(prop.table(freq),4)
##            atelectasis
## sleep_apnea    Yes     No
##         No  0.2542 0.6695
##         Yes 0.0720 0.0042
percent <- round((prop.table(freq, 1)*100),2)
percent
##            atelectasis
## sleep_apnea   Yes    No
##         No  27.52 72.48
##         Yes 94.44  5.56

Mosaic Plot

data %>% 
  mutate(atelectasis = fct_relevel(atelectasis, "No", "Yes")) %>%
ggplot() +
  geom_mosaic(aes(x = product(atelectasis,sleep_apnea), fill=atelectasis),na.rm = TRUE) +
  scale_fill_manual(values=c("grey95","lightsteelblue4")) +
  theme_mosaic() + 
  theme(axis.text.x=element_text(size=rel(0.8)))

chi <- chisq.test(freq, correct=FALSE)
chi
## 
##  Pearson's Chi-squared test
## 
## data:  freq
## X-squared = 33.875, df = 1, p-value = 5.876e-09

Patients with a diagnosis of obstructive sleep apnea had atelectasis more frequently (94.4%) than those without the diagnosis (27.5%) (p<0.001).

Atelectasis location by OSA

Mean expected frequency:

mean_exp <- data %>% 
  drop_na(sleep_apnea, atelectasis_location) %>% 
  summarize(mean_expected_freq = n()/(nlevels(sleep_apnea)*nlevels(atelectasis_location)))

mean_exp
##   mean_expected_freq
## 1              19.25

Mean expected frequency is greater than 5.0, so chi-squared without continuity correction is adequate.

freq <- table(sleep_apnea, atelectasis_location)
freq
##            atelectasis_location
## sleep_apnea Unilateral Bilateral
##         No          43        17
##         Yes         10         7
round(prop.table(freq),4)
##            atelectasis_location
## sleep_apnea Unilateral Bilateral
##         No      0.5584    0.2208
##         Yes     0.1299    0.0909
percent <- round((prop.table(freq, 1)*100),2)
percent
##            atelectasis_location
## sleep_apnea Unilateral Bilateral
##         No       71.67     28.33
##         Yes      58.82     41.18

Mosaic Plot

ggplot(data = data) +
  geom_mosaic(aes(x = product(atelectasis_location,sleep_apnea), 
                  fill=atelectasis_location),na.rm = TRUE) +
  scale_fill_manual(values=c("seagreen1","seagreen4")) +
  theme_mosaic() 

chi <- chisq.test(freq, correct=FALSE)
chi
## 
##  Pearson's Chi-squared test
## 
## data:  freq
## X-squared = 1.0185, df = 1, p-value = 0.3129

The location of atelectasis was not different among patients with and without OSA (p=0.313).

Atelectasis - SpO2

data_spo2 <- data %>% group_by(atelectasis) 

median_spo2 <- data_spo2 %>% summarize(n = n(),
                                       spo2_median = median(spo2_VPO), 
                                       Q1 = quantile(spo2_VPO,0.25), 
                                       Q3 = quantile(spo2_VPO,0.75), 
                                       min = min(spo2_VPO), 
                                       max = max(spo2_VPO)
                                       )
median_spo2
## # A tibble: 2 × 7
##   atelectasis     n spo2_median    Q1    Q3   min   max
##   <fct>       <int>       <int> <dbl> <dbl> <int> <int>
## 1 Yes            77          92    91    93    88    97
## 2 No            159          97    96    98    89    99
boxplot(spo2_VPO ~ atelectasis)

Distribution not normal and influential outliers. Will assess non-parametrically.

wil <- wilcox.test(spo2_VPO ~ atelectasis, data = data_spo2, exact = FALSE)
wil
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  spo2_VPO by atelectasis
## W = 465.5, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

The median SpO2 was significantly lower in patients with atelectasis (92, IQR: 91-93) compared to those without (97, IQR: 96-98) (p<0.001).

Atelectasis location - SpO2

data_spo2 <- data %>% 
  group_by(atelectasis_location) %>% 
  drop_na(atelectasis_location)

median_spo2 <- data_spo2 %>% summarize(n = n(),
                                       spo2_median = median(spo2_VPO), 
                                       Q1 = quantile(spo2_VPO,0.25), 
                                       Q3 = quantile(spo2_VPO,0.75), 
                                       min = min(spo2_VPO), 
                                       max = max(spo2_VPO)
                                       )
median_spo2
## # A tibble: 2 × 7
##   atelectasis_location     n spo2_median    Q1    Q3   min   max
##   <fct>                <int>       <dbl> <dbl> <dbl> <int> <int>
## 1 Unilateral              53        92      92    93    88    95
## 2 Bilateral               24        91.5    90    92    88    97
boxplot(spo2_VPO ~ atelectasis_location, data=data_spo2)

Distribution not normal and influential outliers. Will assess non-parametrically.

wil <- wilcox.test(spo2_VPO ~ atelectasis_location, data = data_spo2, exact = FALSE)
wil
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  spo2_VPO by atelectasis_location
## W = 879, p-value = 0.006227
## alternative hypothesis: true location shift is not equal to 0

The median SpO2 was significantly lower in patients with bilateral atelectasis (91.5, IQR: 90-92) compared to those with unilateral atelectasis (92, IQR: 92-93) (p=0.006).

Atelectasis percent - SpO2

Smooth term?

Scatterplot

plot(spo2_VPO~atelectasis_percent, data=data, 
     main="Scatterplot", 
     xlab="Atelectasis percent (%)", 
     ylab="SpO2 (%)")

Decreasing SpO2 as atelectasis percent increases.

Would a smooth term be more useful to model SpO2?

ggplot(data, aes(atelectasis_percent,spo2_VPO)) + 
  geom_point(size=0.6,color="gray40") + 
  geom_smooth(method="loess", color="black") +
  ylab("SpO2 (%)") + xlab("Atelectasis percent (%)") + 
  ylim(85,100) +
  theme_bw() + 
  theme(panel.border = element_blank(), 
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.line = element_line(colour = "black"),
        axis.text.x = element_text(size=rel(1.2)), 
        axis.text.y = element_text(size=rel(1.2))
        )

GAM model with k=2:

model<-gam(spo2_VPO~s(atelectasis_percent,k=2))
AIC_k2 <- AIC(model)
summary(model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## spo2_VPO ~ s(atelectasis_percent, k = 2)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  94.9958     0.1032   920.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                          edf Ref.df     F p-value    
## s(atelectasis_percent) 1.972  1.999 235.1  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.664   Deviance explained = 66.7%
## GCV = 2.5447  Scale est. = 2.5127    n = 236
k.check(model)
##                        k'      edf   k-index p-value
## s(atelectasis_percent)  2 1.971542 0.9633485   0.265
plot(model)

GAM model with k=4:

model<-gam(spo2_VPO~s(atelectasis_percent,k=4))
AIC_k4 <- AIC(model)
summary(model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## spo2_VPO ~ s(atelectasis_percent, k = 4)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  94.9958     0.1023   928.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                          edf Ref.df     F p-value    
## s(atelectasis_percent) 2.522  2.833 168.8  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.67   Deviance explained = 67.3%
## GCV = 2.5076  Scale est. = 2.4702    n = 236
k.check(model)
##                        k'      edf   k-index p-value
## s(atelectasis_percent)  3 2.522477 0.9820889  0.3675
plot(model)

GAM model with k=6:

model<-gam(spo2_VPO~s(atelectasis_percent,k=6))
AIC_k6 <- AIC(model)
summary(model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## spo2_VPO ~ s(atelectasis_percent, k = 6)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  94.9958     0.1014   936.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                          edf Ref.df     F p-value    
## s(atelectasis_percent) 3.731  4.296 113.9  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.675   Deviance explained =   68%
## GCV = 2.4784  Scale est. = 2.4287    n = 236
k.check(model)
##                        k'      edf  k-index p-value
## s(atelectasis_percent)  5 3.730877 1.000348  0.4325
plot(model)

GAM model with k=8:

model<-gam(spo2_VPO~s(atelectasis_percent,k=8))
AIC_k8 <- AIC(model)
summary(model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## spo2_VPO ~ s(atelectasis_percent, k = 8)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  94.9958     0.1014   936.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                          edf Ref.df     F p-value    
## s(atelectasis_percent) 3.895  4.634 105.7  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.675   Deviance explained = 68.1%
## GCV = 2.4784  Scale est. = 2.427     n = 236
k.check(model)
##                        k'      edf  k-index p-value
## s(atelectasis_percent)  7 3.894793 1.001219  0.5175
plot(model)

Best AIC:

list(AIC_k2,AIC_k4,AIC_k6,AIC_k8)
## [[1]]
## [1] 892.1322
## 
## [[2]]
## [1] 888.6461
## 
## [[3]]
## [1] 885.8384
## 
## [[4]]
## [1] 885.8287

All models are significantly better than linear. Thus, using a smooth term for atelectasis percent is better than modelling a linear relationship.

Regarding AIC, no model offers greater improvement in AIC than k=6. Will try a model with k=5:

GAM model with k=5:

model<-gam(spo2_VPO~s(atelectasis_percent,k=5))
AIC_k5 <- AIC(model)
summary(model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## spo2_VPO ~ s(atelectasis_percent, k = 5)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  94.9958     0.1013   937.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                          edf Ref.df     F p-value    
## s(atelectasis_percent) 3.677  3.932 125.1  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.676   Deviance explained = 68.1%
## GCV = 2.4703  Scale est. = 2.4213    n = 236
k.check(model)
##                        k'      edf  k-index p-value
## s(atelectasis_percent)  4 3.676746 1.002302   0.485
plot(model)

Best AIC:

list(AIC_k4,AIC_k5,AIC_k6)
## [[1]]
## [1] 888.6461
## 
## [[2]]
## [1] 885.0655
## 
## [[3]]
## [1] 885.8384

There is a drop in AIC for k=5, which also offers the best k-index. Nonetheless, one problem with this is that the extra knot is explaining a clump around 12.%, for which there was only one single observation. Thus, it is likely that this clump and additional knot is only explaining noise in the data, and would thus not be a good representation of the trend in the variable. Thus, will keep k=4 to model as this model offers the best visual representation of the trend in all categories.

fig1c <- ggplot(data, aes(atelectasis_percent,spo2_VPO)) + 
  geom_point(size=0.6, color="gray60",
             position = position_jitter(seed = 1, width = .2)) +
  geom_smooth(method="gam",formula = y ~ s(x, bs = "cs", k = 4), color="black") + 
  ylim(83,100) +
  labs(y="SpO2 (%)",x="Atelectasis percent (%)",tag="C") +
  theme_bw() + 
  theme(panel.border = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.line = element_line(colour = "black"),
        axis.text.x = element_text(size=rel(1.2)), 
        axis.text.y = element_text(size=rel(1.2))
        )

fig1c

Negative monotonic relationship since SpO2 decreases as BMI increases. Will assess Spearman’s correlation coefficient:

spearman <- cor.test(spo2_VPO,atelectasis_percent,method="spearman",exact=FALSE)
spearman
## 
##  Spearman's rank correlation rho
## 
## data:  spo2_VPO and atelectasis_percent
## S = 3883525, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.7727568

Atelectasis percent exhibited a negative non-linear monotonic relationship with SpO2 (Figure 1C, rho= -0.773, p<0.001).

Figure 1c
ggsave("Figure1c.png",plot=fig1c,
       path=figfolder, 
       width = 8,  height = 6, units = "in", 
       dpi = 300)

Figure 1

blank_plot = ggplotGrob(ggplot(NULL)); 
blank_plot$grobs[[max(which(blank_plot$layout$name == "panel"))]] = nullGrob(); 
grid.draw(blank_plot)

figure1 <- grid.arrange(fig1a, blank_plot, fig1b, fig1c, ncol=2)

Save figure:

ggsave("Figure1.png",plot=figure1,
       path=figfolder, 
       width = 12,  height = 10, units = "in", 
       dpi = 300)

Ordinal variable

Since there is only one participant in the 30% category, will collapse with the 20 category for further analyses:

datamodel <- data
datamodel$atelectasis_percent <- factor(datamodel$atelectasis_percent, 
                                        levels=c(0,2.5,5,7.5,10,12.5,15,17.5,27.5)) %>% 
  fct_collapse("17.5%" = c(17.5,27.5)) %>% 
  factor(labels = c("0%","2.5%","5%","7.5%","10%","12.5%","15%","17.5%"))

table(datamodel$atelectasis_percent)
## 
##    0%  2.5%    5%  7.5%   10% 12.5%   15% 17.5% 
##   159    11    14    33     6     1     4     8

Assess distribution of SpO2 by atelectasis percent categories:

data_spo2 <- datamodel %>% group_by(atelectasis_percent)

ggplot(data_spo2,aes(x = spo2_VPO)) +
  geom_histogram(fill = "aquamarine", colour = "black") +
  facet_grid(atelectasis_percent ~ .)

Distribution not normal, group sizes are different and there are outliers in both directions, depending where you are located. Thus, will proceed with non-parametric assessment:

median_spo2 <- data_spo2 %>% 
  summarize(n = n(), 
            spo2_median = median(spo2_VPO), 
            Q1 = quantile(spo2_VPO,0.25), 
            Q3 = quantile(spo2_VPO,0.75), 
            min = min(spo2_VPO), 
            max = max(spo2_VPO)
            )
median_spo2
## # A tibble: 8 × 7
##   atelectasis_percent     n spo2_median    Q1    Q3   min   max
##   <fct>               <int>       <dbl> <dbl> <dbl> <int> <int>
## 1 0%                    159        97    96    98      89    99
## 2 2.5%                   11        94    94    94      93    95
## 3 5%                     14        92    92    93      91    94
## 4 7.5%                   33        92    92    93      88    97
## 5 10%                     6        91    91    91      90    92
## 6 12.5%                   1        92    92    92      92    92
## 7 15%                     4        91.5  90.5  92      89    92
## 8 17.5%                   8        88.5  88    89.2    88    92
krus <- kruskal.test(spo2_VPO ~ atelectasis_percent, data = data_spo2)
krus
## 
##  Kruskal-Wallis rank sum test
## 
## data:  spo2_VPO by atelectasis_percent
## Kruskal-Wallis chi-squared = 141.19, df = 7, p-value < 2.2e-16

There was a decreasing trend in median SpO2 with higher atelectasis percentage extension (p<0.001).

ggplot(datamodel, aes(x = atelectasis_percent, y = spo2_VPO)) + 
  geom_boxplot(width = .4, outlier.shape = NA, fill="aliceblue") +
  geom_point(size = 1.5, alpha = .3, 
             position = position_jitter(seed = 1, width = .1)) + 
  ylab("SpO2 (%)") + xlab("Atelectasis percent") + 
  ylim(85,100) +
  labs(tag="Kruskall-Wallis: p<0.001") + 
  theme_bw() + 
  theme(panel.border = element_blank(), 
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.line = element_line(colour = "black"),
        axis.ticks.length.x = unit(.1, "in"),
        axis.text.x = element_text(size=rel(1.2)),
        axis.text.y = element_text(size=rel(1.2)),
        plot.tag.position = "top", 
        plot.tag = element_text(size=rel(0.9))
        )

Relationship between OSA, obesity type and atelectasis percent:

datamodel %>% mutate(
  type_obesity=fct_recode(type_obesity, 
                          "1"="Class 1 Obesity", 
                          "2"="Class 2 Obesity", 
                          "3"="Class 3 Obesity")) %>% 
  count(type_obesity, sleep_apnea, atelectasis_percent) %>%
  ggplot(aes(x = sleep_apnea, y = atelectasis_percent, color = sleep_apnea)) +
  geom_point(aes(group = sleep_apnea, size = n)) +
  scale_color_manual(values=c("gray40","darkred")) +
  facet_wrap(~type_obesity, scales = "free_x",
             labeller = labeller(type_obesity = label_both)) +
  scale_size(breaks = c(1, 10, 20, 30, 40, 50, 60)) + 
  theme_light()

Sleep apnea was more common with higher BMI categories and also with higher atelectasis percentage. Atelectasis percent increases at higher obesity classes.

Statistical Modelling

Plot DAG model to visualizes again:

plot(DAG)

Implied conditional independencies in DAG:

This procedure was performed as suggested in this article.

## age _||_ alt_
## age _||_ sex
## alt_ _||_ sex
## alt_ _||_ typ_ | age, sex, slp_
## hb _||_ slp_ | age, alt_, atl_, sex
## hb _||_ typ_ | age, alt_, atl_, sex

Test conditional independencies:

subsetcondit <- datamodel %>% select(c("age",
                                       "sex",
                                       "type_obesity",
                                       "spo2_VPO",
                                       "hb",
                                       "sleep_apnea",
                                       "atelectasis_percent",
                                       "altitude_cat")
                                     ) 

subsetcondit <- subsetcondit %>% 
  mutate(sex = as.numeric(sex),
         sleep_apnea = as.numeric(sleep_apnea)
         )

corr <- lavCor(subsetcondit,
               ordered=c("sex",
                         "type_obesity",
                         "sleep_apnea",
                         "atelectasis_percent",
                         "altitude_cat",
                         "spo2_VPO")
               )

corr
##                        age    sex typ_bs s2_VPO     hb slp_pn atlct_ alttd_
## age                  1.000                                                 
## sex                  0.022  1.000                                          
## type_obesity        -0.142  0.276  1.000                                   
## spo2_VPO             0.014 -0.102 -0.344  1.000                            
## hb                   0.071  0.403  0.052 -0.045  1.000                     
## sleep_apnea          0.064  0.645  0.286 -0.622  0.219  1.000              
## atelectasis_percent -0.071  0.180  0.460 -0.916  0.069  0.617  1.000       
## altitude_cat         0.014 -0.119 -0.054 -0.063 -0.027  0.193  0.013  1.000
localtests <- localTests(DAG, sample.cov=corr, sample.nobs=nrow( subsetcondit ) )
localtests
##                                        estimate    p.value       2.5%
## age _||_ alt_                        0.01389636 0.83200380 -0.1140077
## age _||_ sex                         0.02162883 0.74124789 -0.1063663
## alt_ _||_ sex                       -0.11910345 0.06774401 -0.2431642
## alt_ _||_ typ_ | age, sex, slp_     -0.08294080 0.20739686 -0.2092591
## hb _||_ slp_ | age, alt_, atl_, sex -0.10913579 0.09729158 -0.2346846
## hb _||_ typ_ | age, alt_, atl_, sex -0.06364414 0.33483830 -0.1908952
##                                           97.5%
## age _||_ alt_                       0.141349797
## age _||_ sex                        0.148922919
## alt_ _||_ sex                       0.008729808
## alt_ _||_ typ_ | age, sex, slp_     0.046071800
## hb _||_ slp_ | age, alt_, atl_, sex 0.019943221
## hb _||_ typ_ | age, alt_, atl_, sex 0.065693116
plotLocalTestResults(localtests)

Conditional independence assumption OK. Minimal set of adjustment is age, sex, and sleep_apnea.

Prevalence Ratio

This paper and accompanying code were used to calculate prevalence ratios.

A modified Poisson regression model with robust errors will be applied to obtain prevalence ratios.

Crude prevalence ratio

dataprev <- data %>% 
  mutate(atelectasis = as.numeric(
    as.character(
      fct_recode(atelectasis,
                 "0"="No",
                 "1"="Yes"))))


poisson_fit <- glm(atelectasis ~ type_obesity,
                   data = dataprev,
                   family = poisson(link = log)
                   )

tidy(poisson_fit, exponentiate = TRUE, conf.int = TRUE)
## # A tibble: 3 × 7
##   term                   estimate std.error statistic p.value conf.low conf.high
##   <chr>                     <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Intercept)               0.129     0.353     -5.79 6.92e-9   0.0590     0.240
## 2 type_obesityClass 2 O…    2.19      0.438      1.79 7.28e-2   0.953      5.45 
## 3 type_obesityClass 3 O…    3.46      0.379      3.28 1.05e-3   1.75       7.87

Robust standard errors:

covmat <- vcovHC(poisson_fit, type = "HC0")
covmat
##                             (Intercept) type_obesityClass 2 Obesity
## (Intercept)                    0.108871                  -0.1088710
## type_obesityClass 2 Obesity   -0.108871                   0.1566697
## type_obesityClass 3 Obesity   -0.108871                   0.1088710
##                             type_obesityClass 3 Obesity
## (Intercept)                                   -0.108871
## type_obesityClass 2 Obesity                    0.108871
## type_obesityClass 3 Obesity                    0.119125
#Calculate the standard error
se <- sqrt(diag(covmat))

# Bind together model output
#  1. exponentiated coefficients
#  2. robust standard errors
#  3. 95% confidence intervals
# Note that qnorm(0.975) approximately equals 1.96
model_output <- cbind(
  Estimate = exp(coef(poisson_fit)),
  `Robust SE` = se,
  Lower = exp(coef(poisson_fit) - qnorm(0.975) * se),
  Upper = exp(coef(poisson_fit) + qnorm(0.975) * se)
)

# Coerce model_output into a data frame
model_output <- as.data.frame(round(model_output,2))
model_output
##                             Estimate Robust SE Lower Upper
## (Intercept)                     0.13      0.33  0.07  0.25
## type_obesityClass 2 Obesity     2.19      0.40  1.01  4.76
## type_obesityClass 3 Obesity     3.46      0.35  1.76  6.80

Adjusted prevalence ratio

poisson_fit <- glm(atelectasis ~ type_obesity + age + sex + sleep_apnea,
data = dataprev,
family = poisson(link = log))

tidy(poisson_fit, exponentiate = TRUE, conf.int = TRUE)
## # A tibble: 6 × 7
##   term                   estimate std.error statistic p.value conf.low conf.high
##   <chr>                     <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Intercept)               0.140    0.615     -3.20  1.38e-3   0.0398     0.447
## 2 type_obesityClass 2 O…    2.13     0.439      1.72  8.47e-2   0.924      5.30 
## 3 type_obesityClass 3 O…    3.10     0.384      2.95  3.21e-3   1.55       7.12 
## 4 age                       0.997    0.0121    -0.283 7.78e-1   0.973      1.02 
## 5 sexMan                    0.722    0.380     -0.859 3.90e-1   0.326      1.46 
## 6 sleep_apneaYes            3.33     0.307      3.92  8.85e-5   1.77       5.92

Robust standard errors:

covmat <- vcovHC(poisson_fit, type = "HC0")
covmat
##                              (Intercept) type_obesityClass 2 Obesity
## (Intercept)                  0.217765534               -9.742767e-02
## type_obesityClass 2 Obesity -0.097427669                1.378078e-01
## type_obesityClass 3 Obesity -0.097289004                9.657037e-02
## age                         -0.003104120                7.319695e-05
## sexMan                       0.007684832                1.033022e-02
## sleep_apneaYes               0.003376050               -1.234191e-02
##                             type_obesityClass 3 Obesity           age
## (Intercept)                               -9.728900e-02 -3.104120e-03
## type_obesityClass 2 Obesity                9.657037e-02  7.319695e-05
## type_obesityClass 3 Obesity                1.078039e-01  8.049292e-05
## age                                        8.049292e-05  7.871795e-05
## sexMan                                    -3.306396e-03 -2.475432e-04
## sleep_apneaYes                            -1.090238e-02 -9.687233e-05
##                                    sexMan sleep_apneaYes
## (Intercept)                  0.0076848325   3.376050e-03
## type_obesityClass 2 Obesity  0.0103302170  -1.234191e-02
## type_obesityClass 3 Obesity -0.0033063959  -1.090238e-02
## age                         -0.0002475432  -9.687233e-05
## sexMan                       0.0591480165  -1.874945e-02
## sleep_apneaYes              -0.0187494458   2.613647e-02
#Calculate the standard error
se <- sqrt(diag(covmat))

# Bind together model output
#  1. exponentiated coefficients
#  2. robust standard errors
#  3. 95% confidence intervals
# Note that qnorm(0.975) approximately equals 1.96
model_output2 <- cbind(
  Estimate = exp(coef(poisson_fit)),
  `Robust SE` = se,
  Lower = exp(coef(poisson_fit) - qnorm(0.975) * se),
  Upper = exp(coef(poisson_fit) + qnorm(0.975) * se)
)

# Coerce model_output into a data frame
model_output2 <- as.data.frame(round(model_output2,2))
model_output2
##                             Estimate Robust SE Lower Upper
## (Intercept)                     0.14      0.47  0.06  0.35
## type_obesityClass 2 Obesity     2.13      0.37  1.03  4.41
## type_obesityClass 3 Obesity     3.10      0.33  1.63  5.90
## age                             1.00      0.01  0.98  1.01
## sexMan                          0.72      0.24  0.45  1.16
## sleep_apneaYes                  3.33      0.16  2.43  4.57

Table 2

model_output <- model_output %>% 
  dplyr::slice(2:3) %>%
  rename(PR = Estimate, SE = `Robust SE`) %>% 
  mutate("95%CI" = paste(Lower,Upper,sep="-")) %>% 
  select(-c(Lower,Upper))

model_output2 <- model_output2 %>% 
  dplyr::slice(2:3) %>%
  rename(aPR = Estimate, aSE = `Robust SE`) %>% 
  mutate("a95%CI" = paste(Lower,Upper,sep="-")) %>% 
  select(-c(Lower,Upper))

table2 <- dplyr::bind_cols(model_output, model_output2) %>% 
  rownames_to_column(var = "Category") %>% 
  mutate_at("Category", str_replace, "type_obesity", "")

flextable(table2) %>% 
  save_as_docx(path = paste(tabfolder,"/Table2.docx",sep=""))

table2
##          Category   PR   SE     95%CI  aPR  aSE    a95%CI
## 1 Class 2 Obesity 2.19 0.40 1.01-4.76 2.13 0.37 1.03-4.41
## 2 Class 3 Obesity 3.46 0.35  1.76-6.8 3.10 0.33  1.63-5.9

Ordinal Logistic Regression Model

Some code available in the following resource was used for these analyses:
- Dunn, Taylor. 2020. “Ordinal Regression in R: Part 1.” March 15, 2020.

Rename type obesity levels to facilitate reading of results:

datamodel <- datamodel %>% 
  mutate(type_obesity=fct_recode(type_obesity,
                                 "1"="Class 1 Obesity",
                                 "2"="Class 2 Obesity",
                                 "3"="Class 3 Obesity"),
         atelectasis_percent=ordered(atelectasis_percent)
         ) 

Visualize pattern of atelectasis percent increase by obesity type category.

datamodel %>% 
  ggplot(aes(x = type_obesity, fill = atelectasis_percent)) + 
  geom_bar() +  
  scale_fill_manual(values = brewer.pal(8,"Blues")) + 
  theme_minimal()

Univariable models

The loglog link function will be used for all models since this distribution provides the best fit for the fully adjusted model:

datamodel %>%
  nest(data = everything()) %>%
 crossing(
    link = c("logit", "probit", "cloglog", "loglog", "cauchit"),
    threshold = c("flexible", "equidistant", "symmetric", "symmetric2")
  ) %>%
  mutate(
    mod = map2(
      data, link,
      ~clm(atelectasis_percent ~ type_obesity + sex + age + sleep_apnea, 
        data = .x, link = .y
      )
    )
  ) %>%
  mutate(
    mod_summary = map(
      mod,
      ~glance(.x) %>% mutate(logLik = as.numeric(logLik))
    )
  ) %>%
  unnest(mod_summary) %>%
  dplyr::select(link, threshold, logLik, edf, AIC, BIC) %>%
  arrange(logLik) %>%
  gt()
link threshold logLik edf AIC BIC
cauchit equidistant -254.5032 12 533.0064 574.5724
cauchit flexible -254.5032 12 533.0064 574.5724
cauchit symmetric -254.5032 12 533.0064 574.5724
cauchit symmetric2 -254.5032 12 533.0064 574.5724
cloglog equidistant -252.1230 12 528.2459 569.8119
cloglog flexible -252.1230 12 528.2459 569.8119
cloglog symmetric -252.1230 12 528.2459 569.8119
cloglog symmetric2 -252.1230 12 528.2459 569.8119
probit equidistant -248.6292 12 521.2584 562.8244
probit flexible -248.6292 12 521.2584 562.8244
probit symmetric -248.6292 12 521.2584 562.8244
probit symmetric2 -248.6292 12 521.2584 562.8244
logit equidistant -248.3580 12 520.7161 562.2820
logit flexible -248.3580 12 520.7161 562.2820
logit symmetric -248.3580 12 520.7161 562.2820
logit symmetric2 -248.3580 12 520.7161 562.2820
loglog equidistant -247.2876 12 518.5752 560.1412
loglog flexible -247.2876 12 518.5752 560.1412
loglog symmetric -247.2876 12 518.5752 560.1412
loglog symmetric2 -247.2876 12 518.5752 560.1412

Obesity class:

fit_BMI <- clm(atelectasis_percent~type_obesity,
               data = datamodel, 
               link = "loglog", threshold = "flexible")

summary(fit_BMI)
## formula: atelectasis_percent ~ type_obesity
## data:    datamodel
## 
##  link   threshold nobs logLik  AIC    niter max.grad cond.H 
##  loglog flexible  236  -260.47 538.94 9(2)  8.12e-09 9.3e+02
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## type_obesity2   0.8277     0.4379   1.890   0.0587 .  
## type_obesity3   1.4929     0.3793   3.936 8.28e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Threshold coefficients:
##           Estimate Std. Error z value
## 0%|2.5%     1.9847     0.3536   5.613
## 2.5%|5%     2.1794     0.3578   6.091
## 5%|7.5%     2.4687     0.3648   6.767
## 7.5%|10%    3.5817     0.4105   8.725
## 10%|12.5%   3.9794     0.4395   9.054
## 12.5%|15%   4.0624     0.4468   9.092
## 15%|17.5%   4.4794     0.4914   9.115
nominal_test(fit_BMI)
## Tests of nominal effects
## 
## formula: atelectasis_percent ~ type_obesity
##              Df  logLik    AIC LRT Pr(>Chi)
## <none>          -260.47 538.94             
## type_obesity
scale_test(fit_BMI)
## Tests of scale effects
## 
## formula: atelectasis_percent ~ type_obesity
##              Df  logLik    AIC    LRT Pr(>Chi)  
## <none>          -260.47 538.94                  
## type_obesity  2 -257.80 537.60 5.3462  0.06904 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Output not showing result of nominal test. When running traceback, this seems to be a problem of failure to converge of the nominal test function. However, this does not constitute a problem in the model convergence itself as shown bellow:

convergence(fit_BMI)
##  nobs logLik  niter max.grad cond.H  logLik.Error
##  236  -260.47 9(2)  8.12e-09 9.3e+02 <1e-10      
## 
##               Estimate Std.Err  Gradient     Error Cor.Dec Sig.Dig
## 0%|2.5%         1.9847  0.3536 -8.12e-09 -1.07e-09       8       9
## 2.5%|5%         2.1794  0.3578 -1.83e-10 -1.07e-09       8       9
## 5%|7.5%         2.4687  0.3648 -2.22e-10 -1.07e-09       8       9
## 7.5%|10%        3.5817  0.4105 -6.20e-11 -1.07e-09       8       9
## 10%|12.5%       3.9794  0.4395 -2.98e-13 -1.07e-09       8       9
## 12.5%|15%       4.0624  0.4468  2.38e-13 -1.07e-09       8       9
## 15%|17.5%       4.4794  0.4914 -6.06e-12 -1.07e-09       8       9
## type_obesity2   0.8277  0.4379  2.55e-15 -1.07e-09       8       8
## type_obesity3   1.4929  0.3793  1.33e-14 -1.07e-09       8       9
## 
## Eigen values of Hessian:
## 732.0787 324.4480 249.4324  81.8157  66.7963  32.3935  19.1533   7.2541   0.7886 
## 
## Convergence message from clm:
## (0) successful convergence 
## In addition: Absolute and relative convergence criteria were met

This problem has been signalled in post1 and post2, without an available solution yet within the package. However, running a model with a nominal term for the explanatory variable and comparing them trough ANOVA is an alternative way of testing the proportional odds assumption:

fit_BMInom <- clm(atelectasis_percent ~ 1, nominal= ~ type_obesity, 
                  data = datamodel, 
                  link = "loglog", threshold = "flexible")
## Warning: (-1) Model failed to converge with max|grad| = 0.0490995 (tol = 1e-06) 
## In addition: maximum number of consecutive Newton modifications reached
summary(fit_BMInom)
## formula: atelectasis_percent ~ 1
## nominal: ~type_obesity
## data:    datamodel
## 
##  link   threshold nobs logLik  AIC    niter max.grad cond.H  
##  loglog flexible  236  -240.69 523.38 12(3) 4.91e-02 -8.4e+16
## 
## Threshold coefficients:
##                         Estimate Std. Error z value
## 0%|2.5%.(Intercept)       2.0811         NA      NA
## 2.5%|5%.(Intercept)       2.4260         NA      NA
## 5%|7.5%.(Intercept)       2.6502         NA      NA
## 7.5%|10%.(Intercept)      6.0240         NA      NA
## 10%|12.5%.(Intercept)    28.9503         NA      NA
## 12.5%|15%.(Intercept)   -13.1247         NA      NA
## 15%|17.5%.(Intercept)    -2.6936         NA      NA
## 0%|2.5%.type_obesity2    -0.9203         NA      NA
## 2.5%|5%.type_obesity2    -0.4894         NA      NA
## 5%|7.5%.type_obesity2    -0.3551         NA      NA
## 7.5%|10%.type_obesity2   -2.0707         NA      NA
## 10%|12.5%.type_obesity2 -21.7026         NA      NA
## 12.5%|15%.type_obesity2  11.2433         NA      NA
## 15%|17.5%.type_obesity2   6.8281         NA      NA
## 0%|2.5%.type_obesity3    -1.5540         NA      NA
## 2.5%|5%.type_obesity3    -1.8479         NA      NA
## 5%|7.5%.type_obesity3    -1.7660         NA      NA
## 7.5%|10%.type_obesity3   -4.0662         NA      NA
## 10%|12.5%.type_obesity3 -26.5944         NA      NA
## 12.5%|15%.type_obesity3  15.5804         NA      NA
## 15%|17.5%.type_obesity3   5.5194         NA      NA

Nominal model failed to converge, likely due to many new subgroups needed to create a nominal model, many of which may have been empty.

One additional problem that may have led to the inability to test the proportional odds assumption is that there is one category with only one observation:

table(datamodel$atelectasis_percent)
## 
##    0%  2.5%    5%  7.5%   10% 12.5%   15% 17.5% 
##   159    11    14    33     6     1     4     8

It is known that having few observations per category does not affect the results of ordinal regression, and that some categories may need to be combined to assess proportional odds assumption. REF

Will collapse categories to test:

data_prop <- datamodel 
data_prop$atelectasis_percent <- fct_collapse(data_prop$atelectasis_percent,
                                              "0%" = c("0%","2.5%"),
                                              "5%" = c("5%","7.5%"),
                                              "10%" = c("10%","12.5%"),
                                              "15%" = c("15%","17.5%")
                                              )

table(data_prop$atelectasis_percent)
## 
##  0%  5% 10% 15% 
## 170  47   7  12
fit_BMI2 <- clm(atelectasis_percent~type_obesity,
                data = data_prop, 
                link = "loglog", threshold = "flexible")

summary(fit_BMI2)
## formula: atelectasis_percent ~ type_obesity
## data:    data_prop
## 
##  link   threshold nobs logLik  AIC    niter max.grad cond.H 
##  loglog flexible  236  -176.83 363.66 8(0)  8.87e-10 1.1e+02
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## type_obesity2   0.4770     0.5402   0.883    0.377    
## type_obesity3   1.7145     0.4317   3.972 7.14e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Threshold coefficients:
##         Estimate Std. Error z value
## 0%|5%     2.2884     0.4083   5.604
## 5%|10%    3.7049     0.4565   8.116
## 10%|15%   4.1871     0.4895   8.554
nominal_test(fit_BMI2)
## Tests of nominal effects
## 
## formula: atelectasis_percent ~ type_obesity
##              Df  logLik    AIC LRT Pr(>Chi)
## <none>          -176.83 363.66             
## type_obesity
scale_test(fit_BMI2)
## Tests of scale effects
## 
## formula: atelectasis_percent ~ type_obesity
##              Df  logLik    AIC     LRT Pr(>Chi)
## <none>          -176.83 363.66                 
## type_obesity  2 -176.78 367.56 0.10419   0.9492
fit_BMInom2 <- clm(atelectasis_percent ~ 1, nominal= ~ type_obesity, 
                   data = data_prop, 
                   link = "loglog", threshold = "flexible")
## Warning: (-3) not all thresholds are increasing: fit is invalid 
## In addition: Absolute convergence criterion was met, but relative criterion was not met
summary(fit_BMInom2)
## formula: atelectasis_percent ~ 1
## nominal: ~type_obesity
## data:    data_prop
## 
##  link   threshold nobs logLik  AIC    niter max.grad cond.H 
##  loglog flexible  236  -171.03 360.05 21(0) 6.23e-09 1.3e+11
## 
## Threshold coefficients:
##                         Estimate Std. Error z value
## 0%|5%.(Intercept)         2.4590     0.4473   5.497
## 5%|10%.(Intercept)       23.0042 12665.6882   0.002
## 10%|15%.(Intercept)      -3.2539 16494.5757   0.000
## 0%|5%.type_obesity2      -0.6488     0.5704  -1.137
## 5%|10%.type_obesity2    -19.7462 12665.6882  -0.002
## 10%|15%.type_obesity2     7.2146 16494.5758   0.000
## 0%|5%.type_obesity3      -1.8822     0.4689  -4.014
## 5%|10%.type_obesity3    -21.0511 12665.6882  -0.002
## 10%|15%.type_obesity3     5.7042 16494.5757   0.000
anova(fit_BMI2,fit_BMInom2)
## Likelihood ratio tests of cumulative link models:
##  
##             formula:                           nominal:      link:  threshold:
## fit_BMI2    atelectasis_percent ~ type_obesity ~1            loglog flexible  
## fit_BMInom2 atelectasis_percent ~ 1            ~type_obesity loglog flexible  
## 
##             no.par    AIC  logLik LR.stat df Pr(>Chisq)  
## fit_BMI2         5 363.66 -176.83                        
## fit_BMInom2      9 360.05 -171.03  11.608  4    0.02052 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The model with nominal effects has a marginally lower AIC (360.05) than the ordinal model (363.66) p=0.02. Proportional odds assumption not met. However, nominal model had problem with convergence (unstable estimates).

One alternative would be to fully model through a multinomial logistic regression model. As already shown, nominal models have failed to converge and have reduced statistical power.

Other alternatives would be to “ignore non-proportional odds, knowing that the odds ratio may still be very meaningful, or fit a partial proportional odds model or constrained partial PO model using either Bayesian modeling with blrm or using a frequentist procedure with the VGAM package vglm function” REF1 plus REF2.

Despite not meeting proportional odds assumption, I will present results of univariable and multivariable ordinal regression models to present first-order effects since this model is parsimonious and easier to understand. I will then try to create a partial PO model to further characterize and understand exposure-effect relationships.

Covariates univariable models

Age

fit_age <- clm(atelectasis_percent~age,
               data = datamodel, 
               link = "loglog", threshold = "flexible")

summary(fit_age)
## formula: atelectasis_percent ~ age
## data:    datamodel
## 
##  link   threshold nobs logLik  AIC    niter max.grad cond.H 
##  loglog flexible  236  -271.61 559.21 9(2)  1.63e-13 2.2e+05
## 
## Coefficients:
##     Estimate Std. Error z value Pr(>|z|)
## age -0.00832    0.01148  -0.725    0.469
## 
## Threshold coefficients:
##           Estimate Std. Error z value
## 0%|2.5%     0.5956     0.4694   1.269
## 2.5%|5%     0.7814     0.4715   1.657
## 5%|7.5%     1.0578     0.4753   2.226
## 7.5%|10%    2.1455     0.5083   4.221
## 10%|12.5%   2.5384     0.5318   4.773
## 12.5%|15%   2.6206     0.5378   4.873
## 15%|17.5%   3.0351     0.5751   5.278
nominal_test(fit_age)
## Tests of nominal effects
## 
## formula: atelectasis_percent ~ age
##        Df  logLik    AIC LRT Pr(>Chi)
## <none>    -271.61 559.21             
## age
scale_test(fit_age)
## Tests of scale effects
## 
## formula: atelectasis_percent ~ age
##        Df  logLik    AIC     LRT Pr(>Chi)
## <none>    -271.61 559.21                 
## age     1 -271.32 560.65 0.56311    0.453

Proportional odds assumption not showing either. Perhaps this is a problem due to centering. Will center and re-assess.

datamodel <- datamodel %>% mutate(agec = age/mean(age))
fit_age2 <- clm(atelectasis_percent~agec,
                data = datamodel, 
                link = "loglog", threshold = "flexible")

summary(fit_age2)
## formula: atelectasis_percent ~ agec
## data:    datamodel
## 
##  link   threshold nobs logLik  AIC    niter max.grad cond.H 
##  loglog flexible  236  -271.61 559.21 8(2)  2.87e-08 1.5e+03
## 
## Coefficients:
##      Estimate Std. Error z value Pr(>|z|)
## agec  -0.3350     0.4621  -0.725    0.469
## 
## Threshold coefficients:
##           Estimate Std. Error z value
## 0%|2.5%     0.5956     0.4694   1.269
## 2.5%|5%     0.7814     0.4715   1.657
## 5%|7.5%     1.0578     0.4753   2.226
## 7.5%|10%    2.1455     0.5083   4.221
## 10%|12.5%   2.5384     0.5318   4.773
## 12.5%|15%   2.6206     0.5378   4.873
## 15%|17.5%   3.0351     0.5751   5.278
nominal_test(fit_age2)
## Tests of nominal effects
## 
## formula: atelectasis_percent ~ agec
##        Df  logLik    AIC LRT Pr(>Chi)
## <none>    -271.61 559.21             
## agec
scale_test(fit_age2)
## Tests of scale effects
## 
## formula: atelectasis_percent ~ agec
##        Df  logLik    AIC     LRT Pr(>Chi)
## <none>    -271.61 559.21                 
## agec    1 -271.32 560.65 0.56311    0.453

Still not shown. Nonetheless, proportional odds assumption is not quite as relevant for covariates. Will assess remaining variables to decide best approach.

uni_age <- tidy(fit_age, conf.int = T, conf.type = "Wald") %>%
  transmute(
    term, across(c(estimate, conf.low, conf.high), exp)
  ) %>%
  gt() %>%
  fmt_number(c(estimate, conf.low, conf.high), decimals = 2) %>% 
  cols_merge(c(conf.low, conf.high),
             pattern = "{1}&mdash;{2}") %>% 
   cols_label(conf.low = "95%CI")

uni_age
term estimate 95%CI
0%|2.5% 1.81 0.72—4.55
2.5%|5% 2.18 0.87—5.50
5%|7.5% 2.88 1.13—7.31
7.5%|10% 8.55 3.16—23.15
10%|12.5% 12.66 4.46—35.90
12.5%|15% 13.74 4.79—39.44
15%|17.5% 20.80 6.74—64.21
age 0.99 0.97—1.01

Sex

fit_sex <- clm(atelectasis_percent~sex,
               data = datamodel, 
               link = "loglog", threshold = "flexible")

summary(fit_sex)
## formula: atelectasis_percent ~ sex
## data:    datamodel
## 
##  link   threshold nobs logLik  AIC    niter max.grad cond.H 
##  loglog flexible  236  -270.89 557.78 8(2)  2.12e-08 2.5e+02
## 
## Coefficients:
##        Estimate Std. Error z value Pr(>|z|)
## sexMan   0.5058     0.3396   1.489    0.136
## 
## Threshold coefficients:
##           Estimate Std. Error z value
## 0%|2.5%     0.9816     0.1228   7.994
## 2.5%|5%     1.1679     0.1314   8.889
## 5%|7.5%     1.4453     0.1463   9.879
## 7.5%|10%    2.5366     0.2346  10.810
## 10%|12.5%   2.9300     0.2817  10.400
## 12.5%|15%   3.0122     0.2929  10.285
## 15%|17.5%   3.4264     0.3570   9.598
nominal_test(fit_sex)
## Tests of nominal effects
## 
## formula: atelectasis_percent ~ sex
##        Df  logLik    AIC LRT Pr(>Chi)
## <none>    -270.89 557.78             
## sex
scale_test(fit_sex)
## Tests of scale effects
## 
## formula: atelectasis_percent ~ sex
##        Df  logLik    AIC       LRT Pr(>Chi)
## <none>    -270.89 557.78                   
## sex     1 -270.88 559.77 0.0099172   0.9207

Does not hold for sex either.

uni_sex <- tidy(fit_sex, conf.int = T, conf.type = "Wald") %>%
  transmute(
    term, across(c(estimate, conf.low, conf.high), exp)
  ) %>%
  gt() %>%
  fmt_number(c(estimate, conf.low, conf.high), decimals = 2) %>% 
  cols_merge(c(conf.low, conf.high),
             pattern = "{1}&mdash;{2}") %>% 
  cols_label(conf.low = "95%CI")

 uni_sex
term estimate 95%CI
0%|2.5% 2.67 2.10—3.39
2.5%|5% 3.22 2.49—4.16
5%|7.5% 4.24 3.19—5.65
7.5%|10% 12.64 7.98—20.02
10%|12.5% 18.73 10.78—32.53
12.5%|15% 20.33 11.45—36.10
15%|17.5% 30.77 15.28—61.93
sexMan 1.66 0.85—3.23

Sleep Apnea

fit_OSA <- clm(atelectasis_percent~sleep_apnea,
               data = datamodel, link = "loglog", 
               threshold = "flexible")

summary(fit_OSA)
## formula: atelectasis_percent ~ sleep_apnea
## data:    datamodel
## 
##  link   threshold nobs logLik  AIC    niter max.grad cond.H 
##  loglog flexible  236  -256.47 528.94 8(2)  2.23e-07 2.2e+02
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## sleep_apneaYes   1.8657     0.2831    6.59 4.39e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Threshold coefficients:
##           Estimate Std. Error z value
## 0%|2.5%     1.1260     0.1296   8.689
## 2.5%|5%     1.3381     0.1407   9.507
## 5%|7.5%     1.6418     0.1577  10.412
## 7.5%|10%    2.7962     0.2471  11.316
## 10%|12.5%   3.1992     0.2929  10.923
## 12.5%|15%   3.2829     0.3037  10.809
## 15%|17.5%   3.7033     0.3663  10.109
nominal_test(fit_OSA)
## Tests of nominal effects
## 
## formula: atelectasis_percent ~ sleep_apnea
##             Df  logLik    AIC LRT Pr(>Chi)
## <none>         -256.47 528.94             
## sleep_apnea
scale_test(fit_OSA)
## Tests of scale effects
## 
## formula: atelectasis_percent ~ sleep_apnea
##             Df  logLik    AIC    LRT Pr(>Chi)
## <none>         -256.47 528.94                
## sleep_apnea  1 -255.84 529.67 1.2653   0.2606

Does not hold for OSA either.

uni_OSA <- tidy(fit_OSA, conf.int = T, conf.type = "Wald") %>%
  transmute(
    term, across(c(estimate, conf.low, conf.high), exp)
  ) %>%
  gt() %>%
  fmt_number(c(estimate, conf.low, conf.high), decimals = 2)  %>% 
  cols_merge(c(conf.low, conf.high),
             pattern = "{1}&mdash;{2}") %>% 
  cols_label(conf.low = "95%CI")

uni_OSA
term estimate 95%CI
0%|2.5% 3.08 2.39—3.97
2.5%|5% 3.81 2.89—5.02
5%|7.5% 5.16 3.79—7.03
7.5%|10% 16.38 10.09—26.59
10%|12.5% 24.51 13.81—43.52
12.5%|15% 26.65 14.70—48.34
15%|17.5% 40.58 19.79—83.20
sleep_apneaYes 6.46 3.71—11.25

Ordinal (collapsed factors) Logistic Regression

Since proportional odds not met for prior model with multiple ordered categories, I will use the priorly collapsed categories to model since AIC was also much lower when including collapsed factors. Perhaps this solves problems in prior models?

Univariable models

Type obesity (ordinal model created before)

uni_BMI <- tidy(fit_BMI2, conf.int = T, conf.type = "Wald") %>%
  transmute(
    term, across(c(estimate, conf.low, conf.high), exp)
  ) %>%
  gt() %>%
  fmt_number(c(estimate, conf.low, conf.high), decimals = 2) %>% 
  cols_merge(c(conf.low, conf.high),
             pattern = "{1}&mdash;{2}") %>% 
   cols_label(conf.low = "95%CI")

uni_BMI
term estimate 95%CI
0%|5% 9.86 4.43—21.95
5%|10% 40.64 16.61—99.44
10%|15% 65.83 25.22—171.82
type_obesity2 1.61 0.56—4.65
type_obesity3 5.55 2.38—12.94

Age

fit_age <- clm(atelectasis_percent~age,
               data = data_prop, 
               link = "loglog", threshold = "flexible")

summary(fit_age)
## formula: atelectasis_percent ~ age
## data:    data_prop
## 
##  link   threshold nobs logLik  AIC    niter max.grad cond.H 
##  loglog flexible  236  -191.67 391.33 8(0)  1.69e-08 9.0e+04
## 
## Coefficients:
##      Estimate Std. Error z value Pr(>|z|)
## age -0.009863   0.012461  -0.792    0.429
## 
## Threshold coefficients:
##         Estimate Std. Error z value
## 0%|5%     0.7210     0.5067   1.423
## 5%|10%    2.0847     0.5415   3.850
## 10%|15%   2.5600     0.5691   4.498
nominal_test(fit_age)
## Tests of nominal effects
## 
## formula: atelectasis_percent ~ age
##        Df  logLik    AIC LRT Pr(>Chi)
## <none>    -191.67 391.33             
## age
scale_test(fit_age)
## Tests of scale effects
## 
## formula: atelectasis_percent ~ age
##        Df  logLik    AIC      LRT Pr(>Chi)
## <none>    -191.67 391.33                  
## age     1 -191.64 393.28 0.052727   0.8184

Proportional odds assumption not showing either. Will try centered variable.

data_prop <- data_prop %>% mutate(agec = age/mean(age))
fit_age2 <- clm(atelectasis_percent~agec,
                data = data_prop, 
                link = "loglog", threshold = "flexible")

summary(fit_age2)
## formula: atelectasis_percent ~ agec
## data:    data_prop
## 
##  link   threshold nobs logLik  AIC    niter max.grad cond.H 
##  loglog flexible  236  -191.67 391.33 8(0)  1.69e-08 1.5e+02
## 
## Coefficients:
##      Estimate Std. Error z value Pr(>|z|)
## agec  -0.3971     0.5017  -0.792    0.429
## 
## Threshold coefficients:
##         Estimate Std. Error z value
## 0%|5%     0.7210     0.5067   1.423
## 5%|10%    2.0847     0.5415   3.850
## 10%|15%   2.5600     0.5691   4.498
nominal_test(fit_age2)
## Tests of nominal effects
## 
## formula: atelectasis_percent ~ agec
##        Df  logLik    AIC LRT Pr(>Chi)
## <none>    -191.67 391.33             
## agec
scale_test(fit_age2)
## Tests of scale effects
## 
## formula: atelectasis_percent ~ agec
##        Df  logLik    AIC      LRT Pr(>Chi)
## <none>    -191.67 391.33                  
## agec    1 -191.64 393.28 0.052727   0.8184

Still not shown. Nonetheless, proportional odds assumption is not quite as relevant for covariates. Will assess remaining variables to decide best approach.

uni_age <- tidy(fit_age, conf.int = T, conf.type = "Wald") %>%
  transmute(
    term, across(c(estimate, conf.low, conf.high), exp)
  ) %>%
  gt() %>%
  fmt_number(c(estimate, conf.low, conf.high), decimals = 2) %>% 
  cols_merge(c(conf.low, conf.high),
             pattern = "{1}&mdash;{2}") %>% 
   cols_label(conf.low = "95%CI")

uni_age
term estimate 95%CI
0%|5% 2.06 0.76—5.55
5%|10% 8.04 2.78—23.24
10%|15% 12.94 4.24—39.47
age 0.99 0.97—1.01

Sex

fit_sex <- clm(atelectasis_percent~sex,
               data = data_prop, 
               link = "loglog", threshold = "flexible")

summary(fit_sex)
## formula: atelectasis_percent ~ sex
## data:    data_prop
## 
##  link   threshold nobs logLik  AIC    niter max.grad cond.H 
##  loglog flexible  236  -191.03 390.06 8(0)  1.05e-08 1.6e+01
## 
## Coefficients:
##        Estimate Std. Error z value Pr(>|z|)
## sexMan   0.5297     0.3602   1.471    0.141
## 
## Threshold coefficients:
##         Estimate Std. Error z value
## 0%|5%     1.1720     0.1329   8.817
## 5%|10%    2.5403     0.2356  10.784
## 10%|15%   3.0162     0.2937  10.271
nominal_test(fit_sex)
## Tests of nominal effects
## 
## formula: atelectasis_percent ~ sex
##        Df  logLik    AIC    LRT Pr(>Chi)
## <none>    -191.03 390.06                
## sex     2 -190.33 392.66 1.3989   0.4969
scale_test(fit_sex)
## Tests of scale effects
## 
## formula: atelectasis_percent ~ sex
##        Df  logLik    AIC      LRT Pr(>Chi)
## <none>    -191.03 390.06                  
## sex     1 -190.98 391.97 0.091001   0.7629

Sex OK.

uni_sex <- tidy(fit_sex, conf.int = T, conf.type = "Wald") %>%
  transmute(
    term, across(c(estimate, conf.low, conf.high), exp)
  ) %>%
  gt() %>%
  fmt_number(c(estimate, conf.low, conf.high), decimals = 2) %>% 
  cols_merge(c(conf.low, conf.high),
             pattern = "{1}&mdash;{2}") %>% 
  cols_label(conf.low = "95%CI")

 uni_sex
term estimate 95%CI
0%|5% 3.23 2.49—4.19
5%|10% 12.68 7.99—20.12
10%|15% 20.41 11.48—36.30
sexMan 1.70 0.84—3.44

Sleep Apnea

fit_OSA <- clm(atelectasis_percent~sleep_apnea,
               data = data_prop, 
               link = "loglog", threshold = "flexible")

summary(fit_OSA)
## formula: atelectasis_percent ~ sleep_apnea
## data:    data_prop
## 
##  link   threshold nobs logLik  AIC    niter max.grad cond.H 
##  loglog flexible  236  -181.28 370.56 8(0)  8.01e-11 1.6e+01
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## sleep_apneaYes    1.684      0.312   5.398 6.74e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Threshold coefficients:
##         Estimate Std. Error z value
## 0%|5%     1.2989     0.1390   9.342
## 5%|10%    2.7400     0.2460  11.138
## 10%|15%   3.2257     0.3029  10.649
nominal_test(fit_OSA)
## Tests of nominal effects
## 
## formula: atelectasis_percent ~ sleep_apnea
##             Df  logLik    AIC     LRT Pr(>Chi)
## <none>         -181.28 370.56                 
## sleep_apnea  2 -181.20 374.41 0.14765   0.9288
scale_test(fit_OSA)
## Tests of scale effects
## 
## formula: atelectasis_percent ~ sleep_apnea
##             Df  logLik    AIC     LRT Pr(>Chi)
## <none>         -181.28 370.56                 
## sleep_apnea  1 -181.22 372.44 0.11567   0.7338

OSA OK.

uni_OSA <- tidy(fit_OSA, conf.int = T, conf.type = "Wald") %>%
  transmute(
    term, across(c(estimate, conf.low, conf.high), exp)
  ) %>%
  gt() %>%
  fmt_number(c(estimate, conf.low, conf.high), decimals = 2)  %>% 
  cols_merge(c(conf.low, conf.high),
             pattern = "{1}&mdash;{2}") %>% 
  cols_label(conf.low = "95%CI")

uni_OSA
term estimate 95%CI
0%|5% 3.67 2.79—4.81
5%|10% 15.49 9.56—25.08
10%|15% 25.17 13.90—45.58
sleep_apneaYes 5.39 2.92—9.93

This modelling approach with collapsed factors is way more stable. Thus, will proceed with this modelling approach.

Multivariable model

fit_multi <- clm(atelectasis_percent ~ type_obesity + age + sex + sleep_apnea,
                 data = data_prop, 
                 link = "loglog", threshold = "flexible")

summary(fit_multi)
## formula: atelectasis_percent ~ type_obesity + age + sex + sleep_apnea
## data:    data_prop
## 
##  link   threshold nobs logLik  AIC    niter max.grad cond.H 
##  loglog flexible  236  -167.75 351.50 8(0)  8.07e-12 1.7e+05
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## type_obesity2   0.392239   0.542266   0.723 0.469475    
## type_obesity3   1.632907   0.438320   3.725 0.000195 ***
## age            -0.002986   0.013200  -0.226 0.821015    
## sexMan         -0.482583   0.429310  -1.124 0.260975    
## sleep_apneaYes  1.697836   0.370437   4.583 4.58e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Threshold coefficients:
##         Estimate Std. Error z value
## 0%|5%     2.2347     0.6855   3.260
## 5%|10%    3.7402     0.7201   5.194
## 10%|15%   4.2325     0.7428   5.698

Convergence:

convergence(fit_multi)
##  nobs logLik  niter max.grad cond.H  logLik.Error
##  236  -167.75 8(0)  8.07e-12 1.7e+05 <1e-10      
## 
##                 Estimate Std.Err  Gradient     Error Cor.Dec Sig.Dig
## 0%|5%           2.234661  0.6855 -5.33e-15 -2.17e-15      14      15
## 5%|10%          3.740199  0.7201  8.02e-12  5.94e-15      13      14
## 10%|15%         4.232542  0.7428 -8.07e-12 -2.70e-13      12      13
## type_obesity2   0.392239  0.5423  2.94e-15 -1.28e-16      15      15
## type_obesity3   1.632907  0.4383  5.42e-14 -2.58e-15      14      15
## age            -0.002986  0.0132  2.65e-12 -4.02e-17      16      14
## sexMan         -0.482583  0.4293  1.62e-14  1.80e-15      14      14
## sleep_apneaYes  1.697836  0.3704  3.22e-14 -6.86e-15      13      14
## 
## Eigen values of Hessian:
## 1.056e+05 7.537e+01 2.777e+01 1.806e+01 1.239e+01 4.400e+00 3.547e+00 6.047e-01 
## 
## Convergence message from clm:
## (0) successful convergence 
## In addition: Absolute and relative convergence criteria were met

Check proportional odds assumption:

nominal_test(fit_multi)
## Tests of nominal effects
## 
## formula: atelectasis_percent ~ type_obesity + age + sex + sleep_apnea
##              Df  logLik    AIC     LRT Pr(>Chi)
## <none>          -167.75 351.50                 
## type_obesity                                   
## age                                            
## sex           2 -167.03 354.05 1.44341   0.4859
## sleep_apnea   2 -167.64 355.28 0.21756   0.8969

Proportional odds OK for all variables except obesity type (as already expected) and age, but proportional odds assumption is not as relevant for covariates.

Check for scale effects:

scale_test(fit_multi)
## Tests of scale effects
## 
## formula: atelectasis_percent ~ type_obesity + age + sex + sleep_apnea
##              Df  logLik    AIC     LRT Pr(>Chi)
## <none>          -167.75 351.50                 
## type_obesity  2 -166.57 353.14 2.35958   0.3073
## age           1 -167.69 353.39 0.10523   0.7456
## sex           1 -167.45 352.90 0.59651   0.4399
## sleep_apnea   1 -167.75 353.50 0.00001   0.9973

No scale effects, good news.

Summarize results in table:

tidy(fit_multi, conf.int = T, conf.type = "Wald") %>%
  transmute(
    term, across(c(estimate, conf.low, conf.high), exp)
  ) %>%
  gt() %>%
  fmt_number(c(estimate, conf.low, conf.high), decimals = 2) %>% 
  cols_merge(c(conf.low, conf.high),
             pattern = "{1}&mdash;{2}") %>% 
  cols_label(conf.low = "95%CI")
term estimate 95%CI
0%|5% 9.34 2.44—35.81
5%|10% 42.11 10.26—172.72
10%|15% 68.89 16.07—295.41
type_obesity2 1.48 0.51—4.28
type_obesity3 5.12 2.17—12.09
age 1.00 0.97—1.02
sexMan 0.62 0.27—1.43
sleep_apneaYes 5.46 2.64—11.29

Partial proportional odds model:

As mentioned before, a partial proportional odds model could be useful in understanding patterns leading to not meeting proportional odds assumption. Thus, I will fit a partial proportional odds model.

The loglog function remains the better function to fit a model with nominal component for type_obesity.

data_prop %>%
  nest(data = everything()) %>%
 crossing(
    link = c("logit", "probit", "cloglog", "loglog", "cauchit"),
    threshold = c("flexible", "equidistant")
  ) %>%
  mutate(
    mod = map2(
      data, link,
      ~clm(atelectasis_percent ~ sex + age + sleep_apnea, nominal = ~ type_obesity,
        data = .x, link = .y
      )
    )
  ) %>%
  mutate(
    mod_summary = map(
      mod,
      ~glance(.x) %>% mutate(logLik = as.numeric(logLik))
    )
  ) %>%
  unnest(mod_summary) %>%
  dplyr::select(link, threshold, logLik, edf, AIC, BIC) %>%
  arrange(logLik) %>%
  gt()
link threshold logLik edf AIC BIC
cauchit equidistant -164.5107 12 353.0215 394.5875
cauchit flexible -164.5107 12 353.0215 394.5875
cloglog equidistant -163.4546 12 350.9092 392.4752
cloglog flexible -163.4546 12 350.9092 392.4752
logit equidistant -162.2484 12 348.4968 390.0628
logit flexible -162.2484 12 348.4968 390.0628
probit equidistant -162.2038 12 348.4076 389.9736
probit flexible -162.2038 12 348.4076 389.9736
loglog equidistant -161.8385 12 347.6771 389.2431
loglog flexible -161.8385 12 347.6771 389.2431

Univariable model

fit_BMIpartuni <- vglm(atelectasis_percent ~ type_obesity, 
                       data=data_prop, link="loglog",
                       family=cumulative(parallel=FALSE~type_obesity, reverse = TRUE)
                       )

summary(fit_BMIpartuni)
## 
## Call:
## vglm(formula = atelectasis_percent ~ type_obesity, family = cumulative(parallel = FALSE ~ 
##     type_obesity, reverse = TRUE), data = data_prop, link = "loglog")
## 
## Coefficients: 
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1    -2.2336     0.4296  -5.200 2.00e-07 ***
## (Intercept):2    -4.1109     1.0082  -4.078 4.55e-05 ***
## (Intercept):3    -4.1109     1.0082  -4.078 4.55e-05 ***
## type_obesity2:1   0.5064     0.5760   0.879   0.3793    
## type_obesity2:2   0.8722     1.2394   0.704   0.4816    
## type_obesity2:3   0.1596     1.4268   0.112   0.9109    
## type_obesity3:1   1.9507     0.4672   4.176 2.97e-05 ***
## type_obesity3:2   2.2295     1.0433   2.137   0.0326 *  
## type_obesity3:3   1.7039     1.0608   1.606   0.1082    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Names of linear predictors: logitlink(P[Y>=2]), logitlink(P[Y>=3]), 
## logitlink(P[Y>=4])
## 
## Residual deviance: 352.2895 on 699 degrees of freedom
## 
## Log-likelihood: -176.1448 on 699 degrees of freedom
## 
## Number of Fisher scoring iterations: 6 
## 
## Warning: Hauck-Donner effect detected in the following estimate(s):
## '(Intercept):3'
## 
## 
## Exponentiated coefficients:
## type_obesity2:1 type_obesity2:2 type_obesity2:3 type_obesity3:1 type_obesity3:2 
##        1.659259        2.392157        1.173077        7.033816        9.295238 
## type_obesity3:3 
##        5.495495
confint_BMIpartuni <- round(exp(confintvglm(fit_BMIpartuni,method = "wald")),2)
confint_BMIpartuni
##                 2.5 % 97.5 %
## (Intercept):1    0.05   0.25
## (Intercept):2    0.00   0.12
## (Intercept):3    0.00   0.12
## type_obesity2:1  0.54   5.13
## type_obesity2:2  0.21  27.15
## type_obesity2:3  0.07  19.22
## type_obesity3:1  2.82  17.57
## type_obesity3:2  1.20  71.83
## type_obesity3:3  0.69  43.95

Multivariable model

Fit a model that allows proportional odds for all explanatory variables, except type obesity:

fit_BMIpart <- vglm(atelectasis_percent ~ type_obesity+sleep_apnea+sex+age, 
                    data=data_prop, 
                    link="loglog", 
                    family=cumulative(parallel=FALSE~type_obesity, reverse = TRUE)
                    )

summary(fit_BMIpart)
## 
## Call:
## vglm(formula = atelectasis_percent ~ type_obesity + sleep_apnea + 
##     sex + age, family = cumulative(parallel = FALSE ~ type_obesity, 
##     reverse = TRUE), data = data_prop, link = "loglog")
## 
## Coefficients: 
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1   -2.185439   0.799026  -2.735  0.00624 ** 
## (Intercept):2   -4.197328   1.255720  -3.343  0.00083 ***
## (Intercept):3   -4.197329   1.255721  -3.343  0.00083 ***
## type_obesity2:1  0.524960   0.592935   0.885  0.37596    
## type_obesity2:2  0.973580   1.283046   0.759  0.44797    
## type_obesity2:3  0.231191   1.459054   0.158  0.87410    
## type_obesity3:1  1.905298   0.486965   3.913 9.13e-05 ***
## type_obesity3:2  2.162276   1.105718   1.956  0.05052 .  
## type_obesity3:3  1.610209   1.123131   1.434  0.15166    
## sleep_apneaYes   2.074173   0.527271   3.934 8.36e-05 ***
## sexMan          -0.387067   0.550704  -0.703  0.48214    
## age             -0.004762   0.015996  -0.298  0.76592    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Names of linear predictors: logitlink(P[Y>=2]), logitlink(P[Y>=3]), 
## logitlink(P[Y>=4])
## 
## Residual deviance: 335.3433 on 696 degrees of freedom
## 
## Log-likelihood: -167.6716 on 696 degrees of freedom
## 
## Number of Fisher scoring iterations: 15 
## 
## Warning: Hauck-Donner effect detected in the following estimate(s):
## '(Intercept):2', '(Intercept):3'
## 
## 
## Exponentiated coefficients:
## type_obesity2:1 type_obesity2:2 type_obesity2:3 type_obesity3:1 type_obesity3:2 
##       1.6903912       2.6474060       1.2601005       6.7214083       8.6908981 
## type_obesity3:3  sleep_apneaYes          sexMan             age 
##       5.0038569       7.9579648       0.6790456       0.9952492
AIC(fit_BMIpart)
## [1] 359.3433
confint_BMIpart <- round(exp(confintvglm(fit_BMIpart,method = "wald")),2)
confint_BMIpart
##                 2.5 % 97.5 %
## (Intercept):1    0.02   0.54
## (Intercept):2    0.00   0.18
## (Intercept):3    0.00   0.18
## type_obesity2:1  0.53   5.40
## type_obesity2:2  0.21  32.73
## type_obesity2:3  0.07  22.00
## type_obesity3:1  2.59  17.46
## type_obesity3:2  1.00  75.90
## type_obesity3:3  0.55  45.22
## sleep_apneaYes   2.83  22.37
## sexMan           0.23   2.00
## age              0.96   1.03

Model SpO2

The SpO2 variable does not have a normal distribution. Furthermore, the distance between 1% increases in SpO2 cannot be considered equidistant increases since values are determined from the S-shaped curve of hemoglobin saturation. This is the reason why the distribution of SpO2 is negatively skewed, with upper values reaching the saturation point of the hemoglobin curve.

Therefore, modelling SpO2 as a linear term could be potentially misleading. Nonetheless, a model assuming a gaussian distribution for SpO2 may potentially be easier to understand and communicate.

Thus, I will first model SpO2 assuming a gaussian distribution and I will then apply a fractional regression model which is more appropriate for the distribution of this variable. If assuming a gaussian distribution does not change the conclusions with respect to the more appropriate fractional model, then I will report the earlier. If results are discordant, then I will report the fractional model, instead.

Model assuming gaussian distribution

Create variable with atelectasis percent as numeric to be able to model with smooth term:

datamodel <- datamodel %>% 
  mutate(atelectasis_smooth = as.numeric(atelectasis_percent))

First, I will fit an empty model

model <- gam(spo2_VPO ~ 1, 
           data=datamodel
           )
AIC_empty <- AIC(model)
R2_empty <- summary(model)$r.sq
dev_empty <- summary(model)$dev.expl

summary(model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## spo2_VPO ~ 1
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   94.996      0.178   533.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## R-sq.(adj) =      0   Deviance explained =    0%
## GCV = 7.5084  Scale est. = 7.4766    n = 236

Fit again GAM with only smooth BMI term (the one used for Figure 1a):

model_BMI <- gam(spo2_VPO ~ s(BMI,k=5), 
                 data=datamodel
                 )

AIC_BMI <- AIC(model_BMI)
R2_BMI <- summary(model_BMI)$r.sq
dev_BMI <- summary(model_BMI)$dev.expl

summary(model_BMI)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## spo2_VPO ~ s(BMI, k = 5)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  94.9958     0.1399   679.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value    
## s(BMI) 3.685  3.941 37.81  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.382   Deviance explained = 39.2%
## GCV = 4.7121  Scale est. = 4.6185    n = 236
plot(model_BMI)

gam.check(model_BMI)

## 
## Method: GCV   Optimizer: magic
## Smoothing parameter selection converged after 6 iterations.
## The RMS GCV score gradient at convergence was 4.669889e-06 .
## The Hessian was positive definite.
## Model rank =  5 / 5 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##          k'  edf k-index p-value
## s(BMI) 4.00 3.69     1.1    0.90

Fit a model that only contains OSA:

model_OSA_only <- gam(spo2_VPO ~ sleep_apnea, 
                    data=datamodel
                    )

AIC_OSA_only <- AIC(model_OSA_only)
R2_OSA_only <- summary(model_OSA_only)$r.sq
dev_OSA_only <- summary(model_OSA_only)$dev.expl

summary(model_OSA_only)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## spo2_VPO ~ sleep_apnea
## 
## Parametric coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     95.2661     0.1742 546.844  < 2e-16 ***
## sleep_apneaYes  -3.5438     0.6308  -5.618 5.46e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## R-sq.(adj) =  0.115   Deviance explained = 11.9%
## GCV = 6.6727  Scale est. = 6.6162    n = 236
gam.check(model_OSA_only)

## 
## Method: GCV   Optimizer: magic
## Model required no smoothing parameter selectionModel rank =  2 / 2

Fit a model that only contains atelectasis percent as smooth term:

model_atel_smooth <- gam(spo2_VPO ~ s(atelectasis_smooth,k=4),
                       data=datamodel
                       )

AIC_atel_smooth <- AIC(model_atel_smooth)
R2_atel_smooth <- summary(model_atel_smooth)$r.sq
dev_atel_smooth <- summary(model_atel_smooth)$dev.expl

summary(model_atel_smooth)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## spo2_VPO ~ s(atelectasis_smooth, k = 4)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   94.996      0.101   940.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                         edf Ref.df     F p-value    
## s(atelectasis_smooth) 2.825  2.972 168.2  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.678   Deviance explained = 68.2%
## GCV = 2.4451  Scale est. = 2.4055    n = 236
gam.check(model_atel_smooth)

## 
## Method: GCV   Optimizer: magic
## Smoothing parameter selection converged after 6 iterations.
## The RMS GCV score gradient at convergence was 5.121918e-06 .
## The Hessian was positive definite.
## Model rank =  4 / 4 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                         k'  edf k-index p-value
## s(atelectasis_smooth) 3.00 2.82    1.01    0.56

Compare against atelectasis_percent as categorical. Will not model as ordinal as coefficients are more difficult to interpret than nominal terms. Whether this variable is entered as nominal or ordinal does not change the results of models. This can be checked by converting atelectasis percent to ordered and back to unordered, which I did as corroboration.

datamodel <- datamodel %>% mutate(
  atelectasis_percent = factor(atelectasis_percent, ordered = FALSE)
  )
model_atel_only <- gam(spo2_VPO ~ atelectasis_percent, 
                     data=datamodel
                     )

AIC_atel_only <- AIC(model_atel_only)
R2_atel_only <- summary(model_atel_only)$r.sq
dev_atel_only <- summary(model_atel_only)$dev.expl

summary(model_atel_only)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## spo2_VPO ~ atelectasis_percent
## 
## Parametric coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               96.4780     0.1235 781.108  < 2e-16 ***
## atelectasis_percent2.5%   -2.3871     0.4856  -4.916 1.69e-06 ***
## atelectasis_percent5%     -3.9780     0.4342  -9.162  < 2e-16 ***
## atelectasis_percent7.5%   -4.5083     0.2979 -15.132  < 2e-16 ***
## atelectasis_percent10%    -5.4780     0.6477  -8.457 3.30e-15 ***
## atelectasis_percent12.5%  -4.4780     1.5623  -2.866  0.00454 ** 
## atelectasis_percent15%    -5.4780     0.7885  -6.948 3.85e-11 ***
## atelectasis_percent17.5%  -7.4780     0.5643 -13.251  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## R-sq.(adj) =  0.676   Deviance explained = 68.5%
## GCV = 2.5108  Scale est. = 2.4257    n = 236
gam.check(model_atel_only)

## 
## Method: GCV   Optimizer: magic
## Model required no smoothing parameter selectionModel rank =  8 / 8

Modelling with smooth term offers improvement in aR2 and drop in AIC. Nonetheless, since this a non-linear relationship between BMI and the outcome is being studied, the explained deviance is more informative. Since categorical atelectasis percent explains greater deviance, I will continue using it as categorical for the subsequent models.

Fit model sBMI plus atelectasis percentage:

model_atel <- gam(spo2_VPO ~ s(BMI,k=5) + s(atelectasis_smooth,k=5), 
                data=datamodel
                )

AIC_atel <- AIC(model_atel)
R2_atel <- summary(model_atel)$r.sq
dev_atel <- summary(model_atel)$dev.expl

summary(model_atel)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## spo2_VPO ~ s(BMI, k = 5) + s(atelectasis_smooth, k = 5)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 94.99576    0.09935   956.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                         edf Ref.df      F p-value    
## s(BMI)                3.449  3.837  3.052  0.0322 *  
## s(atelectasis_smooth) 2.783  3.251 70.797  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.688   Deviance explained = 69.7%
## GCV = 2.4031  Scale est. = 2.3294    n = 236
plot(model_atel, all.terms=TRUE)

gam.check(model_atel)

## 
## Method: GCV   Optimizer: magic
## Smoothing parameter selection converged after 10 iterations.
## The RMS GCV score gradient at convergence was 2.779461e-07 .
## The Hessian was positive definite.
## Model rank =  9 / 9 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                         k'  edf k-index p-value
## s(BMI)                4.00 3.45    1.10    0.93
## s(atelectasis_smooth) 4.00 2.78    0.99    0.41

Model sleep apnea plus sBMI:

model_OSA <- gam(spo2_VPO ~ s(BMI,k=5) + sleep_apnea,
                 data=datamodel
                 )

AIC_OSA <- AIC(model_OSA)
R2_OSA <- summary(model_OSA)$r.sq
dev_OSA <- summary(model_OSA)$dev.expl

summary(model_OSA)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## spo2_VPO ~ s(BMI, k = 5) + sleep_apnea
## 
## Parametric coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     95.2065     0.1375 692.657  < 2e-16 ***
## sleep_apneaYes  -2.7625     0.5085  -5.432 1.41e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value    
## s(BMI) 3.807  3.977 37.54  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.451   Deviance explained = 46.2%
## GCV = 4.2072  Scale est. = 4.1037    n = 236
plot(model_OSA, all.terms=TRUE)

gam.check(model_OSA)

## 
## Method: GCV   Optimizer: magic
## Smoothing parameter selection converged after 5 iterations.
## The RMS GCV score gradient at convergence was 5.372761e-06 .
## The Hessian was positive definite.
## Model rank =  6 / 6 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##          k'  edf k-index p-value
## s(BMI) 4.00 3.81    1.14    0.97

Model sBMI + sleep apnea + atelectasis percent:

model_OSA_atel <- gam(spo2_VPO ~ s(BMI,k=5) +
                        sleep_apnea + s(atelectasis_smooth,k=5),
                      data=datamodel
                      )

AIC_OSA_atel <- AIC(model_OSA_atel)
R2_OSA_atel <- summary(model_OSA_atel)$r.sq
dev_OSA_atel <- summary(model_OSA_atel)$dev.expl

summary(model_OSA_atel)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## spo2_VPO ~ s(BMI, k = 5) + sleep_apnea + s(atelectasis_smooth, 
##     k = 5)
## 
## Parametric coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     95.0447     0.1039 914.572   <2e-16 ***
## sleep_apneaYes  -0.6418     0.4121  -1.557    0.121    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                         edf Ref.df      F p-value    
## s(BMI)                3.528  3.879  3.085   0.026 *  
## s(atelectasis_smooth) 2.750  3.217 56.814  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.69   Deviance explained =   70%
## GCV = 2.3998  Scale est. = 2.3156    n = 236
plot(model_OSA_atel, all.terms=TRUE)

gam.check(model_OSA_atel)

## 
## Method: GCV   Optimizer: magic
## Smoothing parameter selection converged after 9 iterations.
## The RMS GCV score gradient at convergence was 4.264197e-07 .
## The Hessian was positive definite.
## Model rank =  10 / 10 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                         k'  edf k-index p-value
## s(BMI)                4.00 3.53    1.11    0.94
## s(atelectasis_smooth) 4.00 2.75    0.98    0.38

Fit model adjusted for confounders:

model_adj <- gam(spo2_VPO ~ s(BMI,k=5) +
                   sex + age + sleep_apnea + hb + altitude_cat,
                 data=datamodel, na.action=na.omit
                 )


AIC_adj <- AIC(model_adj)
R2_adj <- summary(model_adj)$r.sq
dev_adj <- summary(model_adj)$dev.expl

summary(model_adj)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## spo2_VPO ~ s(BMI, k = 5) + sex + age + sleep_apnea + hb + altitude_cat
## 
## Parametric coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   96.885653   1.710883  56.629  < 2e-16 ***
## sexMan                         0.610780   0.503589   1.213    0.226    
## age                           -0.006073   0.013711  -0.443    0.658    
## sleep_apneaYes                -2.903071   0.542254  -5.354 2.13e-07 ***
## hb                            -0.101724   0.114371  -0.889    0.375    
## altitude_catModerate altitude -0.103037   0.396523  -0.260    0.795    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value    
## s(BMI) 3.773  3.969 37.22  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.448   Deviance explained = 46.8%
## GCV =  4.305  Scale est. = 4.1253    n = 234
plot(model_adj, all.terms=TRUE)

gam.check(model_adj)

## 
## Method: GCV   Optimizer: magic
## Smoothing parameter selection converged after 6 iterations.
## The RMS GCV score gradient at convergence was 6.504339e-06 .
## The Hessian was positive definite.
## Model rank =  10 / 10 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##          k'  edf k-index p-value
## s(BMI) 4.00 3.77    1.12    0.98

Fit adjusted model plus atelectasis percentage:

model_plus <- gam(spo2_VPO ~ s(BMI,k=5) +
                    sex + age + sleep_apnea + hb + altitude_cat +
                    atelectasis_percent,
                  data=datamodel, na.action=na.omit
                  )

AIC_plus <- AIC(model_plus)
R2_plus <- summary(model_plus)$r.sq
dev_plus <- summary(model_plus)$dev.expl

summary(model_plus)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## spo2_VPO ~ s(BMI, k = 5) + sex + age + sleep_apnea + hb + altitude_cat + 
##     atelectasis_percent
## 
## Parametric coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   96.295053   1.295969  74.304  < 2e-16 ***
## sexMan                         0.295662   0.385893   0.766  0.44440    
## age                           -0.002954   0.010466  -0.282  0.77803    
## sleep_apneaYes                -0.673227   0.443733  -1.517  0.13067    
## hb                             0.014394   0.087176   0.165  0.86901    
## altitude_catModerate altitude -0.173961   0.302243  -0.576  0.56550    
## atelectasis_percent2.5%       -2.148467   0.494381  -4.346 2.13e-05 ***
## atelectasis_percent5%         -3.784264   0.467908  -8.088 4.21e-14 ***
## atelectasis_percent7.5%       -4.164107   0.365708 -11.386  < 2e-16 ***
## atelectasis_percent10%        -5.029637   0.714830  -7.036 2.53e-11 ***
## atelectasis_percent12.5%      -4.220356   1.582238  -2.667  0.00822 ** 
## atelectasis_percent15%        -5.077611   0.817844  -6.209 2.67e-09 ***
## atelectasis_percent17.5%      -6.053448   0.777222  -7.789 2.73e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value  
## s(BMI) 3.268  3.733 2.289  0.0844 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.686   Deviance explained = 70.6%
## GCV = 2.5213  Scale est. = 2.346     n = 234
plot(model_plus, all.terms=TRUE)

gam.check(model_plus)

## 
## Method: GCV   Optimizer: magic
## Smoothing parameter selection converged after 4 iterations.
## The RMS GCV score gradient at convergence was 1.26394e-07 .
## The Hessian was positive definite.
## Model rank =  17 / 17 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##          k'  edf k-index p-value
## s(BMI) 4.00 3.27    1.06    0.80

Build a dataframe to compare models:

models <- data.frame(Model = c("empty",
                               "sBMI",
                               "OSA_only",
                               "atel_only",
                               "s(atel)",
                               "sBMI_atel",
                               "sBMI_OSA",
                               "sBMI_atel_OSA",
                               "adjusted",
                               "adjusted_plus_atel"),
                     AIC = c(AIC_empty,
                             AIC_BMI,
                             AIC_OSA_only,
                             AIC_atel_only,
                             AIC_atel_smooth,
                             AIC_atel,
                             AIC_OSA,
                             AIC_OSA_atel,
                             AIC_adj,
                             AIC_plus),
                     aR2 = c(R2_empty,
                             R2_BMI,
                             R2_OSA_only,
                             R2_atel_only,
                             R2_atel_smooth,
                             R2_atel,
                             R2_OSA,
                             R2_OSA_atel,
                             R2_adj,
                             R2_plus),
                     dev = c(dev_empty,
                             dev_BMI,
                             dev_OSA_only,
                             dev_atel_only,
                             dev_atel_smooth,
                             dev_atel,
                             dev_OSA,
                             dev_OSA_atel,
                             dev_adj,
                             dev_plus)
                     )

Sort by AIC

models %>% arrange(AIC)
##                 Model       AIC       aR2       dev
## 1       sBMI_atel_OSA  878.0318 0.6902833 0.6998746
## 2           sBMI_atel  878.4249 0.6884364 0.6966988
## 3  adjusted_plus_atel  881.2705 0.6858420 0.7064277
## 4             s(atel)  882.6807 0.6782654 0.6821328
## 5           atel_only  888.7213 0.6755647 0.6852288
## 6            adjusted 1007.2337 0.4475739 0.4683733
## 7            sBMI_OSA 1010.6786 0.4511281 0.4623563
## 8                sBMI 1037.4747 0.3822678 0.3919547
## 9            OSA_only 1119.6558 0.1150826 0.1188482
## 10              empty 1147.5158 0.0000000 0.0000000

Sort by deviance

models <- models %>% mutate(aR2 = round(aR2,3)*100,
                            dev = round(dev,3)*100
                            )
models %>% arrange(desc(dev))
##                 Model       AIC  aR2  dev
## 1  adjusted_plus_atel  881.2705 68.6 70.6
## 2       sBMI_atel_OSA  878.0318 69.0 70.0
## 3           sBMI_atel  878.4249 68.8 69.7
## 4           atel_only  888.7213 67.6 68.5
## 5             s(atel)  882.6807 67.8 68.2
## 6            adjusted 1007.2337 44.8 46.8
## 7            sBMI_OSA 1010.6786 45.1 46.2
## 8                sBMI 1037.4747 38.2 39.2
## 9            OSA_only 1119.6558 11.5 11.9
## 10              empty 1147.5158  0.0  0.0

Sort by adjusted R2

models %>% arrange(desc(aR2))
##                 Model       AIC  aR2  dev
## 1       sBMI_atel_OSA  878.0318 69.0 70.0
## 2           sBMI_atel  878.4249 68.8 69.7
## 3  adjusted_plus_atel  881.2705 68.6 70.6
## 4             s(atel)  882.6807 67.8 68.2
## 5           atel_only  888.7213 67.6 68.5
## 6            sBMI_OSA 1010.6786 45.1 46.2
## 7            adjusted 1007.2337 44.8 46.8
## 8                sBMI 1037.4747 38.2 39.2
## 9            OSA_only 1119.6558 11.5 11.9
## 10              empty 1147.5158  0.0  0.0

Some conclusions from model assuming normal distribution for SpO2:
- Most of the effect of BMI was atenuated when including atelectasis percent. Nonetheless, a smooth term for BMI was still statistically significant in adjusted models. Will check if these conclusions hold in the more appropriate fractional model.
- There were problems with heteroskedasticity using Gaussian function, with greater error higher SpO2 values. Notably, error term at lower SpO2 values was low, meaning that the terms included in the model very likely explain most of the variability at low SpO2 values. This could make sense, as it is unlikely that normal SpO2 values will be explained by these factors. Thus, it may be a case of true heteroskedasticity. Will again check in fractional model.

Fractional regression function

Convert SpO2 to fractional values between 0 and 1 to model.

datamodel <- datamodel %>% mutate(spo2_fraction = spo2_VPO/100)

Empty model

First, I will fit an empty model

model<-gam(spo2_fraction~1,
           data=datamodel, 
           family = quasibinomial(link = logit)
           )

R2_empty <- summary(model)$r.sq
dev_empty <- summary(model)$dev.expl

summary(model)
## 
## Family: quasibinomial 
## Link function: logit 
## 
## Formula:
## spo2_fraction ~ 1
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.94355    0.03744   78.62   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## R-sq.(adj) =      0   Deviance explained = -3.04e-13%
## GCV = 0.015586  Scale est. = 0.015728  n = 236

Smooth BMI

Fit again GAM with only smooth BMI term (the one used for Figure 1a):

model_BMI <- gam(spo2_fraction~s(BMI,k=5), 
                 data=datamodel, 
                 family = quasibinomial(link = logit)
                 )

R2_BMI <- summary(model_BMI)$r.sq
dev_BMI <- summary(model_BMI)$dev.expl

summary(model_BMI)
## 
## Family: quasibinomial 
## Link function: logit 
## 
## Formula:
## spo2_fraction ~ s(BMI, k = 5)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.98307    0.03217   92.73   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value    
## s(BMI) 3.312  3.737 32.29  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.379   Deviance explained = 32.9%
## GCV = 0.010767  Scale est. = 0.010884  n = 236
plot(model_BMI)

gam.check(model_BMI)

## 
## Method: GCV   Optimizer: outer newton
## full convergence after 5 iterations.
## Gradient range [2.084401e-08,2.084401e-08]
## (score 0.01076717 & scale 0.01088391).
## Hessian positive definite, eigenvalue range [3.213419e-05,3.213419e-05].
## Model rank =  5 / 5 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##          k'  edf k-index p-value
## s(BMI) 4.00 3.31    1.09    0.88

OSA only

Fit a model that only contains OSA:

model_OSA_only<-gam(spo2_fraction~sleep_apnea,
                    data=datamodel,
                    family = quasibinomial(link = logit)
                    )

R2_OSA_only <- summary(model_OSA_only)$r.sq
dev_OSA_only <- summary(model_OSA_only)$dev.expl

summary(model_OSA_only)
## 
## Family: quasibinomial 
## Link function: logit 
## 
## Formula:
## spo2_fraction ~ sleep_apnea
## 
## Parametric coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.00191    0.03833  78.326  < 2e-16 ***
## sleep_apneaYes -0.59672    0.10971  -5.439 1.34e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## R-sq.(adj) =  0.115   Deviance explained = 10.3%
## GCV = 0.014099  Scale est. = 0.014441  n = 236
gam.check(model_OSA_only)

## 
## Method: GCV   Optimizer: outer newton
## Model required no smoothing parameter selectionModel rank =  2 / 2

Atelectasis percent

Fit a model that only contains atelectasis percent:

model_atel_only<-gam(spo2_fraction ~ atelectasis_percent,
                     data=datamodel, 
                     family = quasibinomial(link = logit)
                     )

R2_atel_only <- summary(model_atel_only)$r.sq
dev_atel_only <- summary(model_atel_only)$dev.expl

summary(model_atel_only)
## 
## Family: quasibinomial 
## Link function: logit 
## 
## Formula:
## spo2_fraction ~ atelectasis_percent
## 
## Parametric coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               3.31028    0.03416  96.903  < 2e-16 ***
## atelectasis_percent2.5%  -0.54251    0.10713  -5.064 8.45e-07 ***
## atelectasis_percent5%    -0.79798    0.08751  -9.118  < 2e-16 ***
## atelectasis_percent7.5%  -0.87205    0.06127 -14.233  < 2e-16 ***
## atelectasis_percent10%   -0.99665    0.11831  -8.424 4.10e-15 ***
## atelectasis_percent12.5% -0.86794    0.29467  -2.945  0.00356 ** 
## atelectasis_percent15%   -0.99665    0.14287  -6.976 3.26e-11 ***
## atelectasis_percent17.5% -1.21954    0.09601 -12.703  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## R-sq.(adj) =  0.676   Deviance explained = 62.6%
## GCV = 0.006189  Scale est. = 0.0063048  n = 236
gam.check(model_atel_only)

## 
## Method: GCV   Optimizer: outer newton
## Model required no smoothing parameter selectionModel rank =  8 / 8

s(BMI) + atelectasis percentage

Fit model sBMI plus atelectasis percentage:

model_atel<-gam(spo2_fraction ~ s(BMI,k=5) + atelectasis_percent,
                data=datamodel, 
                family = quasibinomial(link = logit)
                )

R2_atel <- summary(model_atel)$r.sq
dev_atel <- summary(model_atel)$dev.expl

summary(model_atel)
## 
## Family: quasibinomial 
## Link function: logit 
## 
## Formula:
## spo2_fraction ~ s(BMI, k = 5) + atelectasis_percent
## 
## Parametric coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               3.29735    0.03566  92.459  < 2e-16 ***
## atelectasis_percent2.5%  -0.54196    0.10717  -5.057 8.77e-07 ***
## atelectasis_percent5%    -0.76229    0.09217  -8.271 1.13e-14 ***
## atelectasis_percent7.5%  -0.83200    0.06932 -12.002  < 2e-16 ***
## atelectasis_percent10%   -0.94286    0.12600  -7.483 1.59e-12 ***
## atelectasis_percent12.5% -0.80592    0.29893  -2.696  0.00754 ** 
## atelectasis_percent15%   -0.95486    0.14686  -6.502 4.99e-10 ***
## atelectasis_percent17.5% -1.12631    0.12209  -9.225  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value
## s(BMI) 1.005   1.01 1.553   0.215
## 
## R-sq.(adj) =  0.678   Deviance explained = 62.9%
## GCV = 0.0061984  Scale est. = 0.0063098  n = 236
plot(model_atel, all.terms=TRUE)

gam.check(model_atel)

## 
## Method: GCV   Optimizer: outer newton
## full convergence after 6 iterations.
## Gradient range [-4.583064e-09,-4.583064e-09]
## (score 0.006198443 & scale 0.006309842).
## Hessian positive definite, eigenvalue range [5.485351e-09,5.485351e-09].
## Model rank =  12 / 12 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##          k'  edf k-index p-value
## s(BMI) 4.00 1.01     1.1    0.94

OSA + BMI

Model sleep apnea plus sBMI:

model_OSA <- gam(spo2_fraction ~ s(BMI,k=5) + sleep_apnea, 
               data=datamodel, 
               family = quasibinomial(link = logit)
               )

R2_OSA <- summary(model_OSA)$r.sq
dev_OSA <- summary(model_OSA)$dev.expl

summary(model_OSA)
## 
## Family: quasibinomial 
## Link function: logit 
## 
## Formula:
## spo2_fraction ~ s(BMI, k = 5) + sleep_apnea
## 
## Parametric coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.02441    0.03253  92.982  < 2e-16 ***
## sleep_apneaYes -0.45014    0.09506  -4.735 3.82e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value    
## s(BMI) 3.557  3.883 30.07  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.425   Deviance explained = 38.8%
## GCV = 0.0099253  Scale est. = 0.010027  n = 236
plot(model_OSA, all.terms=TRUE)

sBMI + OSA + atelectasis percent

Model sBMI + sleep apnea + atelectasis percent:

model_OSA_atel <-gam(spo2_fraction ~ s(BMI,k=5) + sleep_apnea + 
                       atelectasis_percent,
                     data=datamodel, 
                     family = quasibinomial(link = logit)
                     )

R2_OSA_atel <- summary(model_OSA_atel)$r.sq
dev_OSA_atel <- summary(model_OSA_atel)$dev.expl

summary(model_OSA_atel)
## 
## Family: quasibinomial 
## Link function: logit 
## 
## Formula:
## spo2_fraction ~ s(BMI, k = 5) + sleep_apnea + atelectasis_percent
## 
## Parametric coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               3.29754    0.03567  92.437  < 2e-16 ***
## sleep_apneaYes           -0.07729    0.07824  -0.988  0.32424    
## atelectasis_percent2.5%  -0.52072    0.10945  -4.758 3.49e-06 ***
## atelectasis_percent5%    -0.75059    0.09298  -8.073 4.07e-14 ***
## atelectasis_percent7.5%  -0.81492    0.07150 -11.398  < 2e-16 ***
## atelectasis_percent10%   -0.91640    0.12879  -7.115 1.46e-11 ***
## atelectasis_percent12.5% -0.80485    0.29904  -2.691  0.00765 ** 
## atelectasis_percent15%   -0.93410    0.14857  -6.287 1.65e-09 ***
## atelectasis_percent17.5% -1.10427    0.12460  -8.863 2.37e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df    F p-value
## s(BMI) 1.001  1.002 1.63   0.203
## 
## R-sq.(adj) =  0.679   Deviance explained = 63.1%
## GCV = 0.0062253  Scale est. = 0.0063143  n = 236
plot(model_OSA_atel, all.terms=TRUE)

gam.check(model_OSA_atel)

## 
## Method: GCV   Optimizer: outer newton
## full convergence after 6 iterations.
## Gradient range [-6.474023e-09,-6.474023e-09]
## (score 0.006225264 & scale 0.006314294).
## Hessian positive definite, eigenvalue range [6.511298e-09,6.511298e-09].
## Model rank =  13 / 13 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##        k' edf k-index p-value
## s(BMI)  4   1     1.1    0.94

Adjusted model

Fit model adjusted for confounders:

model_adj<-gam(spo2_fraction~s(BMI,k=5)+sex+age+sleep_apnea+hb+altitude_cat,
               data=datamodel, 
               na.action=na.omit, 
               family = quasibinomial(link = logit)
               )

R2_adj <- summary(model_adj)$r.sq
dev_adj <- summary(model_adj)$dev.expl

summary(model_adj)
## 
## Family: quasibinomial 
## Link function: logit 
## 
## Formula:
## spo2_fraction ~ s(BMI, k = 5) + sex + age + sleep_apnea + hb + 
##     altitude_cat
## 
## Parametric coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    3.38710    0.39225   8.635  1.1e-15 ***
## sexMan                         0.12972    0.11249   1.153    0.250    
## age                           -0.00121    0.00315  -0.384    0.701    
## sleep_apneaYes                -0.47773    0.10250  -4.661  5.4e-06 ***
## hb                            -0.02217    0.02645  -0.838    0.403    
## altitude_catModerate altitude -0.02806    0.08869  -0.316    0.752    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value    
## s(BMI) 3.493  3.852 29.68  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.42   Deviance explained = 39.5%
## GCV = 0.010105  Scale est. = 0.010041  n = 234
plot(model_adj, all.terms=TRUE)

gam.check(model_adj)

## 
## Method: GCV   Optimizer: outer newton
## full convergence after 6 iterations.
## Gradient range [2.615768e-09,2.615768e-09]
## (score 0.01010485 & scale 0.01004117).
## Hessian positive definite, eigenvalue range [2.720599e-05,2.720599e-05].
## Model rank =  10 / 10 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##          k'  edf k-index p-value
## s(BMI) 4.00 3.49     1.1    0.93

Adjusted + Atelectasis percent

Fit adjusted model plus atelectasis percentage:

model_plus<-gam(spo2_fraction ~ s(BMI,k=5) +
                  sex + age + sleep_apnea + hb + altitude_cat +
                  atelectasis_percent,
                data=datamodel, 
                na.action=na.omit, 
                family = quasibinomial(link = logit)
                )

R2_plus <- summary(model_plus)$r.sq
dev_plus <- summary(model_plus)$dev.expl

summary(model_plus)
## 
## Family: quasibinomial 
## Link function: logit 
## 
## Formula:
## spo2_fraction ~ s(BMI, k = 5) + sex + age + sleep_apnea + hb + 
##     altitude_cat + atelectasis_percent
## 
## Parametric coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    3.2635864  0.3054348  10.685  < 2e-16 ***
## sexMan                         0.0488753  0.0900997   0.542  0.58805    
## age                           -0.0008131  0.0025857  -0.314  0.75348    
## sleep_apneaYes                -0.0902632  0.0857037  -1.053  0.29341    
## hb                             0.0048886  0.0207568   0.236  0.81402    
## altitude_catModerate altitude -0.0499119  0.0704348  -0.709  0.47931    
## atelectasis_percent2.5%       -0.5158425  0.1097937  -4.698 4.62e-06 ***
## atelectasis_percent5%         -0.7556435  0.0940532  -8.034 5.71e-14 ***
## atelectasis_percent7.5%       -0.8190971  0.0719975 -11.377  < 2e-16 ***
## atelectasis_percent10%        -0.9343044  0.1303743  -7.166 1.15e-11 ***
## atelectasis_percent12.5%      -0.8041808  0.3025669  -2.658  0.00844 ** 
## atelectasis_percent15%        -0.9382125  0.1503699  -6.239 2.23e-09 ***
## atelectasis_percent17.5%      -1.1090567  0.1276979  -8.685 8.69e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value
## s(BMI) 1.157  1.298 1.225   0.316
## 
## R-sq.(adj) =   0.68   Deviance explained = 63.8%
## GCV = 0.0062941  Scale est. = 0.0062863  n = 234
plot(model_plus, all.terms=TRUE)

gam.check(model_plus)

## 
## Method: GCV   Optimizer: outer newton
## full convergence after 3 iterations.
## Gradient range [-6.457525e-10,-6.457525e-10]
## (score 0.006294057 & scale 0.006286255).
## Hessian positive definite, eigenvalue range [9.039609e-07,9.039609e-07].
## Model rank =  17 / 17 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##          k'  edf k-index p-value
## s(BMI) 4.00 1.16    1.05    0.78

Build a dataframe to compare models:

models <- data.frame(Model = c("empty",
                               "sBMI",
                               "OSA_only",
                               "atel_only",
                               "sBMI_atel",
                               "sBMI_OSA",
                               "sBMI_atel_OSA",
                               "adjusted",
                               "adjusted_plus_atel"),
                     aR2 = c(R2_empty,
                             R2_BMI,
                             R2_OSA_only,
                             R2_atel_only,
                             R2_atel,
                             R2_OSA,
                             R2_OSA_atel,
                             R2_adj,
                             R2_plus),
                     dev = c(dev_empty,
                             dev_BMI,
                             dev_OSA_only,
                             dev_atel_only,
                             dev_atel,
                             dev_OSA,
                             dev_OSA_atel,
                             dev_adj,
                             dev_plus)
           )

Sort by deviance

models <- models %>% mutate(aR2 = round(aR2,3)*100,
                            dev = round(dev,3)*100
                            )
models %>% arrange(desc(dev))
##                Model  aR2  dev
## 1 adjusted_plus_atel 68.0 63.8
## 2      sBMI_atel_OSA 67.9 63.1
## 3          sBMI_atel 67.8 62.9
## 4          atel_only 67.6 62.6
## 5           adjusted 42.0 39.5
## 6           sBMI_OSA 42.5 38.8
## 7               sBMI 37.9 32.9
## 8           OSA_only 11.5 10.3
## 9              empty  0.0  0.0

Sort by R2

models %>% arrange(desc(aR2))
##                Model  aR2  dev
## 1 adjusted_plus_atel 68.0 63.8
## 2      sBMI_atel_OSA 67.9 63.1
## 3          sBMI_atel 67.8 62.9
## 4          atel_only 67.6 62.6
## 5           sBMI_OSA 42.5 38.8
## 6           adjusted 42.0 39.5
## 7               sBMI 37.9 32.9
## 8           OSA_only 11.5 10.3
## 9              empty  0.0  0.0

Table S3

Create table for models:

tab_BMI_only <- tbl_regression(model_BMI, 
                               exponentiate = TRUE, 
                               tidy_fun = gtsummary::tidy_gam)

tab_OSA_only <- tbl_regression(model_OSA_only, 
                               exponentiate = TRUE, 
                               tidy_fun = gtsummary::tidy_gam)

tab_atel_only <- tbl_regression(model_atel_only,
                                exponentiate = TRUE,
                                tidy_fun = gtsummary::tidy_gam) 

tab_OSA <- tbl_regression(model_OSA,
                          exponentiate = TRUE, 
                          tidy_fun = gtsummary::tidy_gam) 

tab_atel <- tbl_regression(model_atel, 
                           exponentiate = TRUE,
                           tidy_fun = gtsummary::tidy_gam)

tab_OSA_atel <- tbl_regression(model_OSA_atel,
                               exponentiate = TRUE, 
                               tidy_fun = gtsummary::tidy_gam)

tab_adj <- tbl_regression(model_adj, 
                          exponentiate = TRUE,
                          tidy_fun = gtsummary::tidy_gam)

tab_plus <- tbl_regression(model_plus,
                           exponentiate = TRUE, 
                           tidy_fun = gtsummary::tidy_gam)

tableS3 <- tbl_stack(
  list(tab_BMI_only,tab_OSA_only,tab_atel_only,tab_OSA,tab_atel,tab_OSA_atel,tab_adj,tab_plus),
  group_header = c("BMI only", "OSA only", "Atelectasis percent only", "BMI + OSA", "BMI + Atelectasis percent", "BMI + OSA + Atelectasis percent", "BMI + OSA + age + sex + hb + altitude","Fully adjusted model")
  ) 

tableS3
Characteristic OR1 95% CI1 p-value
BMI only
s(BMI)

<0.001
OSA only
sleep_apnea


    No
    Yes 0.55 0.44, 0.68 <0.001
Atelectasis percent only
atelectasis_percent


    0%
    2.5% 0.58 0.47, 0.72 <0.001
    5% 0.45 0.38, 0.53 <0.001
    7.5% 0.42 0.37, 0.47 <0.001
    10% 0.37 0.29, 0.47 <0.001
    12.5% 0.42 0.24, 0.75 0.004
    15% 0.37 0.28, 0.49 <0.001
    17.5% 0.30 0.24, 0.36 <0.001
BMI + OSA
sleep_apnea


    No
    Yes 0.64 0.53, 0.77 <0.001
s(BMI)

<0.001
BMI + Atelectasis percent
atelectasis_percent


    0%
    2.5% 0.58 0.47, 0.72 <0.001
    5% 0.47 0.39, 0.56 <0.001
    7.5% 0.44 0.38, 0.50 <0.001
    10% 0.39 0.30, 0.50 <0.001
    12.5% 0.45 0.25, 0.80 0.008
    15% 0.38 0.29, 0.51 <0.001
    17.5% 0.32 0.26, 0.41 <0.001
s(BMI)

0.2
BMI + OSA + Atelectasis percent
sleep_apnea


    No
    Yes 0.93 0.79, 1.08 0.3
atelectasis_percent


    0%
    2.5% 0.59 0.48, 0.74 <0.001
    5% 0.47 0.39, 0.57 <0.001
    7.5% 0.44 0.38, 0.51 <0.001
    10% 0.40 0.31, 0.51 <0.001
    12.5% 0.45 0.25, 0.80 0.008
    15% 0.39 0.29, 0.53 <0.001
    17.5% 0.33 0.26, 0.42 <0.001
s(BMI)

0.2
BMI + OSA + age + sex + hb + altitude
sex


    Woman
    Man 1.14 0.91, 1.42 0.3
Age 1.00 0.99, 1.00 0.7
sleep_apnea


    No
    Yes 0.62 0.51, 0.76 <0.001
Hemoglobin 0.98 0.93, 1.03 0.4
altitude_cat


    Low altitude
    Moderate altitude 0.97 0.82, 1.16 0.8
s(BMI)

<0.001
Fully adjusted model
sex


    Woman
    Man 1.05 0.88, 1.25 0.6
Age 1.00 0.99, 1.00 0.8
sleep_apnea


    No
    Yes 0.91 0.77, 1.08 0.3
Hemoglobin 1.00 0.96, 1.05 0.8
altitude_cat


    Low altitude
    Moderate altitude 0.95 0.83, 1.09 0.5
atelectasis_percent


    0%
    2.5% 0.60 0.48, 0.74 <0.001
    5% 0.47 0.39, 0.56 <0.001
    7.5% 0.44 0.38, 0.51 <0.001
    10% 0.39 0.30, 0.51 <0.001
    12.5% 0.45 0.25, 0.81 0.008
    15% 0.39 0.29, 0.53 <0.001
    17.5% 0.33 0.26, 0.42 <0.001
s(BMI)

0.3
1 OR = Odds Ratio, CI = Confidence Interval

Save Table S3

tableS3 %>%
  as_gt() %>%
  gt::gtsave(filename = "TableS3.docx", path = tabfolder)

Figure SpO2:

Figure 3a: sBMI

Check residuals:

draw(model_BMI,residuals=TRUE) 

Now, plot take the inverse logit function to assess partial effect on mean SpO2.

draw(model_BMI, 
     constant = coef(model_BMI)[1], 
     fun = inv_link(model_BMI)
     ) 

Draw a personalized plot:

plot3a <- draw(model_BMI, 
     constant = coef(model_BMI)[1], 
     fun = inv_link(model_BMI), 
     smooth_col = "cadetblue4"
     ) +
  scale_y_continuous(labels = scales::percent, limits=c(0.80,1)) +
  labs(x="Body mass index (kg/m²)", 
       y = "Partial effect (mean SpO2)", 
       title = "s(BMI)", 
       subtitle=paste0("Deviance explained:"," ",(round(dev_BMI,3)*100),"%"),
       tag="A",
       caption=NULL) + 
  theme_bw() +
  theme(panel.border = element_blank(),
        panel.grid.minor = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.text.x = element_text(size=rel(1.2)), 
        axis.text.y = element_text(size=rel(1.2))
        )

plot3a 

Figure 3b: sBMI_OSA

Check residuals:

draw(model_OSA,residuals=TRUE) 

Partial effect on mean SpO2:

plot3b <- draw(model_OSA, 
     constant = coef(model_OSA)[1], 
     fun = inv_link(model_OSA), 
     smooth_col = "cadetblue4"
     ) +
  scale_y_continuous(labels = scales::percent, limits=c(0.80,1)) +
  labs(x="Body mass index (kg/m²)", 
       y = "Partial effect (mean SpO2)", 
       title = "s(BMI) + OSA", 
       subtitle=paste0("Deviance explained:"," ",(round(dev_OSA,3)*100),"%"),
       tag="B",
       caption=NULL) + 
  theme_bw() +
  theme(panel.border = element_blank(),
        panel.grid.minor = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.text.x = element_text(size=rel(1.2)), 
        axis.text.y = element_text(size=rel(1.2))
        )

plot3b 

Figure 3c: sBMI_atel

Check residuals:

draw(model_atel,residuals=TRUE) 

Partial effect on mean SpO2:

plot3c <- draw(model_atel, 
     constant = coef(model_atel)[1], 
     fun = inv_link(model_atel), 
     smooth_col = "cadetblue4"
     ) +
  scale_y_continuous(labels = scales::percent, limits=c(0.80,1)) +
  labs(x="Body mass index (kg/m²)", 
       y = "Partial effect (mean SpO2)", 
       title = "s(BMI) + atelectasis percent", 
       subtitle=paste0("Deviance explained:"," ",(round(dev_atel,3)*100),"%"),
       tag="C",
       caption=NULL) + 
  theme_bw() +
  theme(panel.border = element_blank(),
        panel.grid.minor = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.text.x = element_text(size=rel(1.2)), 
        axis.text.y = element_text(size=rel(1.2))
        )

plot3c 

Figure 3d: fully adjusted

Check residuals:

draw(model_plus,residuals=TRUE) 

Partial effect on mean SpO2:

plot3d <- draw(model_plus, 
     constant = coef(model_plus)[1], 
     fun = inv_link(model_plus), 
     smooth_col = "cadetblue4"
     ) +
  scale_y_continuous(labels = scales::percent, limits=c(0.80,1)) +
  labs(x="Body mass index (kg/m²)", 
       y = "Partial effect (mean SpO2)", 
       title = "Fully adjusted model", 
       subtitle=paste0("Deviance explained:"," ",(round(dev_plus,3)*100),"%"),
       tag="D",
       caption=NULL) + 
  theme_bw() +
  theme(panel.border = element_blank(),
        panel.grid.minor = element_blank(), 
        axis.line = element_line(colour = "black"),
        axis.text.x = element_text(size=rel(1.2)), 
        axis.text.y = element_text(size=rel(1.2))
        )

plot3d 

Figure 3

figure3 <- grid.arrange(plot3a, plot3b, plot3c, plot3d)

Save figure:

ggsave("Figure3.png",plot=figure3,path=figfolder, width = 10,  height = 7, units = "in", dpi = 300)

Post-hoc analyses

In figure 2, it can be observed that SpO2 starts decreasing at BMIs above 40. Thus, by having used the WHO obesity class categories, detail on differences above BMI 40 for the extent of atelectasis percentage may have been lost. The WHO obesity class categories do not reflect the extent of variation in BMI observed in this sample of patients:
- Class 1, BMI [30,35): ~1/4 participants
- Class 2, BMI [35,40): ~1/4 participants - Class 3, BMI >40: ~1/2 of participants, with a median BMI above a 5 units range: 45.56.

Thus, creating subcategories within the class 3 obesity may allow to assess the impact of BMI increases above 40 on atelectasis percentage with more detail.

Create new variable with the following categories:
- BMI [30,35)
- BMI [35,40)
- BMI [40,45)
- BMI [44,50)
- BMI >50

data_prop <- data_prop %>% 
  mutate(BMI_breaks = cut(BMI,
                            breaks=c(30,35,40,45,50,80),
                            right=FALSE,
                            labels=c("[30,35)","[35,40)","[40,45)","[45,50)","≥50")
                            )
         )
attach(data_prop)
tab <- table(BMI_breaks)
tab
## BMI_breaks
## [30,35) [35,40) [40,45) [45,50)     ≥50 
##      63      53      57      31      32
prop <- round(prop.table(tab)*100,1)
prop
## BMI_breaks
## [30,35) [35,40) [40,45) [45,50)     ≥50 
##    26.7    22.5    24.2    13.1    13.6

Atelectasis - obesity breaks

Mean expected frequency:

mean_exp <- data_prop %>% summarize(mean_expected_freq = n()/(nlevels(BMI_breaks)*nlevels(atelectasis)))
mean_exp
##   mean_expected_freq
## 1               23.6
freq <- table(BMI_breaks, atelectasis)
freq
##           atelectasis
## BMI_breaks Yes No
##    [30,35)   8 55
##    [35,40)  15 38
##    [40,45)   7 50
##    [45,50)  15 16
##    ≥50      32  0
round(prop.table(freq),4)
##           atelectasis
## BMI_breaks    Yes     No
##    [30,35) 0.0339 0.2331
##    [35,40) 0.0636 0.1610
##    [40,45) 0.0297 0.2119
##    [45,50) 0.0636 0.0678
##    ≥50     0.1356 0.0000

Mosaic Plot

data_prop %>% mutate(atelectasis = fct_relevel(atelectasis, "No", "Yes")) %>%
ggplot() +
  geom_mosaic(aes(x = product(atelectasis,BMI_breaks), fill=atelectasis),na.rm = TRUE) +
  scale_fill_manual(values=c("grey95","lightsteelblue4")) +
  theme_mosaic() + 
  theme(axis.text.x=element_text(size=rel(0.8)))

chi <- chisq.test(freq, correct=FALSE)
chi
## 
##  Pearson's Chi-squared test
## 
## data:  freq
## X-squared = 92.149, df = 4, p-value < 2.2e-16

Atelectasis location by obesity class

Mean expected frequency:

mean_exp <- data_prop %>% drop_na(BMI_breaks, atelectasis_location) %>% summarize(mean_expected_freq = n()/(nlevels(BMI_breaks)*nlevels(atelectasis_location)))
mean_exp
##   mean_expected_freq
## 1                7.7

Mean expected frequency is greater than 5.0, so chi-squared without continuity correction is adequate.

freq <- table(BMI_breaks, atelectasis_location)
freq
##           atelectasis_location
## BMI_breaks Unilateral Bilateral
##    [30,35)          7         1
##    [35,40)         10         5
##    [40,45)          3         4
##    [45,50)         11         4
##    ≥50             22        10
round(prop.table(freq),4)
##           atelectasis_location
## BMI_breaks Unilateral Bilateral
##    [30,35)     0.0909    0.0130
##    [35,40)     0.1299    0.0649
##    [40,45)     0.0390    0.0519
##    [45,50)     0.1429    0.0519
##    ≥50         0.2857    0.1299

Mosaic Plot

ggplot(data_prop = data_prop) +
  geom_mosaic(aes(x = product(BMI_breaks,atelectasis_location), fill=BMI_breaks),na.rm = TRUE) +
  scale_fill_manual(values=c("seagreen1","seagreen2","seagreen3","seagreen4","seagreen")) +
  theme_mosaic() 

chi <- chisq.test(freq, correct=FALSE)
chi
## 
##  Pearson's Chi-squared test
## 
## data:  freq
## X-squared = 3.6755, df = 4, p-value = 0.4517
#Patients who had atelectasis (Yes/No)
atelectasis_total <- data_prop %>% group_by(atelectasis) %>% 
  summarise(n = n()) %>%  
  mutate(prev = n / sum(n)*100, 
         lower = lapply(n, prop.test, n = sum(n)), 
         upper = sapply(lower, function(x) x$conf.int[2])*100, 
         lower = sapply(lower, function(x) x$conf.int[1])*100,
         BMI_breaks = "Total") %>% 
  mutate_at(3:5, round, 2)
atelectasis_total$confint <- paste(atelectasis_total$lower, "-", atelectasis_total$upper)
atelectasis_total <- atelectasis_total %>% dplyr::select(-c(lower,upper))

atelectasis_obesity <- data_prop %>% group_by(BMI_breaks, atelectasis) %>% 
  summarise(n = n()) %>%  
  mutate(prev = n / sum(n)*100, 
         lower = lapply(n, prop.test, n = sum(n)), 
         upper = sapply(lower, function(x) x$conf.int[2])*100, 
         lower = sapply(lower, function(x) x$conf.int[1])*100) %>% 
  mutate_at(4:6, round, 2)
atelectasis_obesity$confint <- paste(atelectasis_obesity$lower, "-", atelectasis_obesity$upper)
atelectasis_obesity <- atelectasis_obesity %>% dplyr::select(-c(lower,upper))


atelectasis <- atelectasis_total %>% bind_rows(atelectasis_obesity) %>% relocate(BMI_breaks, .before = n)


#Location of atelectasis (Unilateral/Bilateral)
atelectasis_total_location <- data_prop %>% filter(atelectasis=="Yes") %>% group_by(atelectasis_location) %>% summarise(Freq = n()) %>%  mutate(Percentage = (round((Freq/sum(Freq)*100),digits=2)), BMI_breaks = "Total")
atelectasis_total_location$sumpercent <- paste(atelectasis_total_location$Freq," (", atelectasis_total_location$Percentage, "%)")
atelectasis_total_location <- atelectasis_total_location %>% dplyr::select(-c(Freq,Percentage)) %>% pivot_wider(names_from = atelectasis_location,values_from = sumpercent)
                                                                                                                         
atelectasis_obesity_location <- data_prop %>% filter(atelectasis=="Yes") %>% group_by(BMI_breaks, atelectasis_location) %>% summarise(Freq = n()) %>%  mutate(Percentage = (round((Freq/sum(Freq)*100),digits=2)))
atelectasis_obesity_location$sumpercent <- paste(atelectasis_obesity_location$Freq," (", atelectasis_obesity_location$Percentage, "%)")
atelectasis_obesity_location <- atelectasis_obesity_location %>% dplyr::select(-c(Freq,Percentage)) %>% pivot_wider(names_from = atelectasis_location,values_from = sumpercent)

location <- atelectasis_total_location %>% bind_rows(atelectasis_obesity_location)

atelectasis <- atelectasis %>% right_join(location,by="BMI_breaks") %>% mutate(confint=replace(confint, atelectasis=="No", NA), Unilateral=replace(Unilateral, atelectasis=="No", NA), Bilateral=replace(Bilateral, atelectasis=="No", NA))
atelectasis
## # A tibble: 11 × 7
##    atelectasis BMI_breaks     n  prev confint       Unilateral     Bilateral    
##    <fct>       <chr>      <int> <dbl> <chr>         <chr>          <chr>        
##  1 Yes         Total         77  32.6 26.77 - 39.06 53  ( 68.83 %) 24  ( 31.17 …
##  2 No          Total        159  67.4 <NA>          <NA>           <NA>         
##  3 Yes         [30,35)        8  12.7 6.03 - 24.04  7  ( 87.5 %)   1  ( 12.5 %) 
##  4 No          [30,35)       55  87.3 <NA>          <NA>           <NA>         
##  5 Yes         [35,40)       15  28.3 17.2 - 42.56  10  ( 66.67 %) 5  ( 33.33 %)
##  6 No          [35,40)       38  71.7 <NA>          <NA>           <NA>         
##  7 Yes         [40,45)        7  12.3 5.49 - 24.29  3  ( 42.86 %)  4  ( 57.14 %)
##  8 No          [40,45)       50  87.7 <NA>          <NA>           <NA>         
##  9 Yes         [45,50)       15  48.4 30.56 - 66.6  11  ( 73.33 %) 4  ( 26.67 %)
## 10 No          [45,50)       16  51.6 <NA>          <NA>           <NA>         
## 11 Yes         ≥50           32 100   86.66 - 100   22  ( 68.75 %) 10  ( 31.25 …

Atelectasis Percent

Mean atelectasis percentage

Mean atelectasis percentage coverage by BMI breaks:

If inoring zeo-inflation and skewness:

data %>% mutate(BMI_breaks = cut(BMI,breaks=c(30,35,40,45,50,80),right=FALSE,
                                     labels=c("[30,35)","[35,40)","[40,45)","[45,50)","≥50")),
                    atelectasis_percent=as.numeric(atelectasis_percent)) %>% 
  group_by(BMI_breaks) %>%
  summarize(
    mean = mean(atelectasis_percent),
    sd = sd(atelectasis_percent)
  ) 
## # A tibble: 5 × 3
##   BMI_breaks   mean    sd
##   <fct>       <dbl> <dbl>
## 1 [30,35)     0.913  2.89
## 2 [35,40)     1.56   3.15
## 3 [40,45)     0.702  2.05
## 4 [45,50)     3.63   4.22
## 5 ≥50        10.5    5.40

As is evident from these numbers, assuming normality causes standard deviation to capture negative values, which is impossible in reality for this variable.

Thus, bootstrapping mean and 95%CI is expected to lead to more appropriate estimates:

data_class_1 <- data %>% 
  mutate(BMI_breaks = cut(BMI,breaks=c(30,35,40,45,50,80),right=FALSE,
                                     labels=c("[30,35)","[35,40)","[40,45)","[45,50)","≥50")),
         atelectasis_percent=as.numeric(atelectasis_percent)) %>% 
  group_by(BMI_breaks) %>%
  filter(BMI_breaks=="[40,45)") 


data_class_2 <- data %>% 
  mutate(BMI_breaks = cut(BMI,breaks=c(30,35,40,45,50,80),right=FALSE,
                                     labels=c("[30,35)","[35,40)","[40,45)","[45,50)","≥50")),
         atelectasis_percent=as.numeric(atelectasis_percent)) %>% 
  group_by(BMI_breaks) %>% 
  filter(BMI_breaks=="[45,50)") 


data_class_3 <- data %>% 
  mutate(BMI_breaks = cut(BMI,breaks=c(30,35,40,45,50,80),right=FALSE,
                                     labels=c("[30,35)","[35,40)","[40,45)","[45,50)","≥50")),
         atelectasis_percent=as.numeric(atelectasis_percent)) %>% 
  group_by(BMI_breaks) %>% 
  filter(BMI_breaks=="≥50") 

BMI [40,45)

boot_class1 <- one.boot(data_class_1$atelectasis_percent, mean, R=10000)
mean_boot_class1 <- mean(boot_class1$t)
mean_boot_class1
## [1] 0.7016447
boot_ci_class1 <- boot.ci(boot_class1)
boot_ci_class1
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 10000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_class1)
## 
## Intervals : 
## Level      Normal              Basic         
## 95%   ( 0.1738,  1.2300 )   ( 0.1316,  1.1842 )  
## 
## Level     Percentile            BCa          
## 95%   ( 0.2193,  1.2719 )   ( 0.2193,  1.2719 )  
## Calculations and Intervals on Original Scale

BMI [44,50)

boot_class2 <- one.boot(data_class_2$atelectasis_percent, mean, R=10000)
mean_boot_class2 <- mean(boot_class2$t)
mean_boot_class2
## [1] 3.620282
boot_ci_class2 <- boot.ci(boot_class2)
boot_ci_class2
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 10000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_class2)
## 
## Intervals : 
## Level      Normal              Basic         
## 95%   ( 2.174,  5.102 )   ( 2.097,  5.081 )  
## 
## Level     Percentile            BCa          
## 95%   ( 2.177,  5.161 )   ( 2.177,  5.081 )  
## Calculations and Intervals on Original Scale
  • BMI >50
boot_class3 <- one.boot(data_class_3$atelectasis_percent, mean, R=10000)
mean_boot_class3 <- mean(boot_class3$t)
mean_boot_class3
## [1] 10.47598
boot_ci_class3 <- boot.ci(boot_class3)
boot_ci_class3
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 10000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_class3)
## 
## Intervals : 
## Level      Normal              Basic         
## 95%   ( 8.62, 12.30 )   ( 8.52, 12.19 )  
## 
## Level     Percentile            BCa          
## 95%   ( 8.75, 12.42 )   ( 8.67, 12.34 )  
## Calculations and Intervals on Original Scale

The mean atelectasis percentage according to class 3 subgroups was: BMI [40,45) (0.7%, 95%CI:0.22-1.27), BMI [44,50) (3.62%, 95%CI:2.18-5.08), and BMI >50 (10.48%, 95%CI:8.67-12.34).

Mean expected frequency:

mean_exp <- data_prop %>% mutate(atelectasis_percent=factor(atelectasis_percent)) %>% summarize(mean_expected_freq = n()/(nlevels(BMI_breaks)*nlevels(atelectasis_percent)))
mean_exp
##   mean_expected_freq
## 1               11.8

Mean expected frequency is greater than 5.0, so chi-squared without continuity correction is adequate.

tab <- table(atelectasis_percent,type_obesity)
tab
##                    type_obesity
## atelectasis_percent  1  2  3
##                 0%  56 45 69
##                 5%   5  6 36
##                 10%  0  1  6
##                 15%  1  1 10
prop.table(tab)
##                    type_obesity
## atelectasis_percent           1           2           3
##                 0%  0.237288136 0.190677966 0.292372881
##                 5%  0.021186441 0.025423729 0.152542373
##                 10% 0.000000000 0.004237288 0.025423729
##                 15% 0.004237288 0.004237288 0.042372881
prop_fig2a <- prop.table(tab,margin=2)
prop_fig2a
##                    type_obesity
## atelectasis_percent          1          2          3
##                 0%  0.90322581 0.84905660 0.57024793
##                 5%  0.08064516 0.11320755 0.29752066
##                 10% 0.00000000 0.01886792 0.04958678
##                 15% 0.01612903 0.01886792 0.08264463
tab <- table(atelectasis_percent,BMI_breaks)
tab
##                    BMI_breaks
## atelectasis_percent [30,35) [35,40) [40,45) [45,50) ≥50
##                 0%       57      45      52      16   0
##                 5%        5       6       5      12  19
##                 10%       0       1       0       2   4
##                 15%       1       1       0       1   9
prop.table(tab)
##                    BMI_breaks
## atelectasis_percent     [30,35)     [35,40)     [40,45)     [45,50)         ≥50
##                 0%  0.241525424 0.190677966 0.220338983 0.067796610 0.000000000
##                 5%  0.021186441 0.025423729 0.021186441 0.050847458 0.080508475
##                 10% 0.000000000 0.004237288 0.000000000 0.008474576 0.016949153
##                 15% 0.004237288 0.004237288 0.000000000 0.004237288 0.038135593
prop_fig2b <- prop.table(tab,margin=2)
prop_fig2b
##                    BMI_breaks
## atelectasis_percent    [30,35)    [35,40)    [40,45)    [45,50)        ≥50
##                 0%  0.90476190 0.84905660 0.91228070 0.51612903 0.00000000
##                 5%  0.07936508 0.11320755 0.08771930 0.38709677 0.59375000
##                 10% 0.00000000 0.01886792 0.00000000 0.06451613 0.12500000
##                 15% 0.01587302 0.01886792 0.00000000 0.03225806 0.28125000
barplot(tab,beside=TRUE)

chi <- chisq.test(tab, correct=FALSE)
chi
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 126.47, df = 12, p-value < 2.2e-16

The relative frequencies of atelectasis percentage according to BMI breaks were significantly different (Figure 2b, p<0.001).

Figure 2b
barplot(prop_fig2b,beside=TRUE,ylim=c(0,1),xlim=c(0,34),ylab="Relative frequency",xlab="Body mass index (kg/m²)",
        col=brewer.pal(4,"Blues"),
        legend.text=c("0-5%","5-10%","10-15%","≥15%"),
        space = c(0.2, 1.5)
        )

Figure2b <- recordPlot()
png(filename=paste(figfolder,"/Figure2b.png",sep=""),width=8, height=5, units="in", res=300)
Figure2b
dev.off

I only want to show frequencies for the subcategories of class 3 obesity.

prop_fig2b <- prop_fig2b[1:4,3:5]

Rebuild plots and stack them:

layout(matrix(c(1,2), ncol=2), widths=c(5,5))
par(mgp=c(0,2,0))  
barplot(prop_fig2a,beside=TRUE,ylim=c(0,1),yaxt='n',
        main="A",adj=0,
        names.arg=expression(atop("[30,35)","Class 1"),atop("[35,40)","Class 2"),atop("≥40","Class 3")),
        col=brewer.pal(4,"Blues"),
        space = c(0.2, 1.5), 
        cex.axis = 0.9, cex.names = 0.9
        )
axis(2, at=0.5, pos=0, labels="Relative frequency", las=0, tck=0, lwd=0)
axis(2, at=c(0.0,0.2,0.4,0.6,0.8,1.0), labels=FALSE)
axis(2, at=c(0.0,0.2,0.4,0.6,0.8,1.0), labels=c("0.0","0.2","0.4","0.6","0.8","1.0"), pos=3, tck=0, lwd=0, cex.axis=0.9)
par(mgp=c(2,0.5,0)) 
barplot(prop_fig2b,beside=TRUE,ylim=c(0,1),
        main="B",adj=0,
        xlab="         Class 3 subgroups (kg/m²)", cex.lab = 0.9,
        col=brewer.pal(4,"Blues"),
        legend.text=c("0-5%","5-10%","10-15%","≥15%"), args.legend = c(y=1.1,cex=0.8),
        space = c(0.2, 1.5), 
        cex.axis = 0.9, cex.names = 0.9
        )

Figure2 <- recordPlot() 
png(filename=paste(figfolder,"/Figure2.png",sep=""),width=8, height=5, units="in", res=300)
Figure2
dev.off

Crude prevalence ratio

dataprev <- data_prop %>% mutate(atelectasis = as.numeric(as.character(fct_recode(atelectasis,"0"="No","1"="Yes"))))

poisson_fit <- glm(atelectasis ~ BMI_breaks,
data = dataprev,
family = poisson(link = log))

tidy(poisson_fit, exponentiate = TRUE, conf.int = TRUE)
## # A tibble: 5 × 7
##   term              estimate std.error statistic      p.value conf.low conf.high
##   <chr>                <dbl>     <dbl>     <dbl>        <dbl>    <dbl>     <dbl>
## 1 (Intercept)          0.127     0.354   -5.84        5.31e-9   0.0580     0.236
## 2 BMI_breaks[35,40)    2.23      0.438    1.83        6.72e-2   0.968      5.54 
## 3 BMI_breaks[40,45)    0.967     0.518   -0.0646      9.48e-1   0.339      2.69 
## 4 BMI_breaks[45,50)    3.81      0.438    3.06        2.25e-3   1.66       9.46 
## 5 BMI_breaks≥50        7.87      0.395    5.22        1.78e-7   3.82      18.4

Robust standard errors:

covmat <- vcovHC(poisson_fit, type = "HC0")
covmat
##                   (Intercept) BMI_breaks[35,40) BMI_breaks[40,45)
## (Intercept)          0.109127        -0.1091270        -0.1091270
## BMI_breaks[35,40)   -0.109127         0.1569257         0.1091270
## BMI_breaks[40,45)   -0.109127         0.1091270         0.2344403
## BMI_breaks[45,50)   -0.109127         0.1091270         0.1091270
## BMI_breaks≥50       -0.109127         0.1091270         0.1091270
##                   BMI_breaks[45,50) BMI_breaks≥50
## (Intercept)              -0.1091270     -0.109127
## BMI_breaks[35,40)         0.1091270      0.109127
## BMI_breaks[40,45)         0.1091270      0.109127
## BMI_breaks[45,50)         0.1435356      0.109127
## BMI_breaks≥50             0.1091270      0.109127
#Calculate the standard error
se <- sqrt(diag(covmat))

# Bind together model output
#  1. exponentiated coefficients
#  2. robust standard errors
#  3. 95% confidence intervals
# Note that qnorm(0.975) approximately equals 1.96
model_output <- cbind(
  Estimate = exp(coef(poisson_fit)),
  `Robust SE` = se,
  Lower = exp(coef(poisson_fit) - qnorm(0.975) * se),
  Upper = exp(coef(poisson_fit) + qnorm(0.975) * se)
)

# Coerce model_output into a data frame
# Return second row to focus on the weekheart variable
model_output <- as.data.frame(round(model_output,2))
model_output
##                   Estimate Robust SE Lower Upper
## (Intercept)           0.13      0.33  0.07  0.24
## BMI_breaks[35,40)     2.23      0.40  1.03  4.84
## BMI_breaks[40,45)     0.97      0.48  0.37  2.50
## BMI_breaks[45,50)     3.81      0.38  1.81  8.01
## BMI_breaks≥50         7.87      0.33  4.12 15.05

Adjusted prevalence ratio

poisson_fit <- glm(atelectasis ~ BMI_breaks + age + sex + sleep_apnea,
data = dataprev,
family = poisson(link = log))

tidy(poisson_fit, exponentiate = TRUE, conf.int = TRUE)
## # A tibble: 8 × 7
##   term              estimate std.error statistic    p.value conf.low conf.high
##   <chr>                <dbl>     <dbl>     <dbl>      <dbl>    <dbl>     <dbl>
## 1 (Intercept)          0.109    0.616     -3.60  0.000319     0.0310     0.349
## 2 BMI_breaks[35,40)    2.17     0.439      1.76  0.0779       0.940      5.39 
## 3 BMI_breaks[40,45)    0.909    0.521     -0.184 0.854        0.317      2.55 
## 4 BMI_breaks[45,50)    3.35     0.443      2.73  0.00633      1.44       8.39 
## 5 BMI_breaks≥50        7.01     0.403      4.83  0.00000135   3.34      16.5  
## 6 age                  1.00     0.0120     0.211 0.833        0.979      1.03 
## 7 sexMan               0.877    0.375     -0.351 0.725        0.399      1.75 
## 8 sleep_apneaYes       2.73     0.307      3.27  0.00109      1.45       4.86

Robust standard errors:

covmat <- vcovHC(poisson_fit, type = "HC0")
covmat
##                    (Intercept) BMI_breaks[35,40) BMI_breaks[40,45)
## (Intercept)        0.177554148      -0.102924823     -0.0877845754
## BMI_breaks[35,40) -0.102924823       0.141303486      0.0976206338
## BMI_breaks[40,45) -0.087784575       0.097620634      0.1992085966
## BMI_breaks[45,50) -0.102133326       0.101540303      0.0976068644
## BMI_breaks≥50     -0.107113830       0.100485122      0.0952267231
## age               -0.001925166       0.000117901     -0.0002658152
## sexMan            -0.001046827       0.009748158      0.0035840805
## sleep_apneaYes     0.003033885      -0.014501059      0.0046586761
##                   BMI_breaks[45,50) BMI_breaks≥50           age       sexMan
## (Intercept)            -0.102133326 -0.1071138302 -1.925166e-03 -0.001046827
## BMI_breaks[35,40)       0.101540303  0.1004851223  1.179010e-04  0.009748158
## BMI_breaks[40,45)       0.097606864  0.0952267231 -2.658152e-04  0.003584080
## BMI_breaks[45,50)       0.134135903  0.1017412423  1.057530e-04  0.004249592
## BMI_breaks≥50           0.101741242  0.1047868860  1.934062e-04  0.001146054
## age                     0.000105753  0.0001934062  4.696052e-05 -0.000138888
## sexMan                  0.004249592  0.0011460540 -1.388880e-04  0.073608454
## sleep_apneaYes         -0.022316208 -0.0021893547 -5.242813e-06 -0.023867442
##                   sleep_apneaYes
## (Intercept)         3.033885e-03
## BMI_breaks[35,40)  -1.450106e-02
## BMI_breaks[40,45)   4.658676e-03
## BMI_breaks[45,50)  -2.231621e-02
## BMI_breaks≥50      -2.189355e-03
## age                -5.242813e-06
## sexMan             -2.386744e-02
## sleep_apneaYes      4.567423e-02
#Calculate the standard error
se <- sqrt(diag(covmat))

# Bind together model output
#  1. exponentiated coefficients
#  2. robust standard errors
#  3. 95% confidence intervals
# Note that qnorm(0.975) approximately equals 1.96
model_output2 <- cbind(
  Estimate = exp(coef(poisson_fit)),
  `Robust SE` = se,
  Lower = exp(coef(poisson_fit) - qnorm(0.975) * se),
  Upper = exp(coef(poisson_fit) + qnorm(0.975) * se)
)

# Coerce model_output into a data frame
# Return second row to focus on the weekheart variable
model_output2 <- as.data.frame(round(model_output2,2))
model_output2
##                   Estimate Robust SE Lower Upper
## (Intercept)           0.11      0.42  0.05  0.25
## BMI_breaks[35,40)     2.17      0.38  1.04  4.53
## BMI_breaks[40,45)     0.91      0.45  0.38  2.18
## BMI_breaks[45,50)     3.35      0.37  1.63  6.87
## BMI_breaks≥50         7.01      0.32  3.72 13.22
## age                   1.00      0.01  0.99  1.02
## sexMan                0.88      0.27  0.52  1.49
## sleep_apneaYes        2.73      0.21  1.80  4.15

Table 2

model_output <- model_output %>% 
  dplyr::slice(2:5) %>%
  rename(PR = Estimate, SE = `Robust SE`) %>% 
  mutate("95%CI" = paste(Lower,Upper,sep="-")) %>% 
  select(-c(Lower,Upper))

model_output2 <- model_output2 %>% 
  dplyr::slice(2:5) %>%
  rename(aPR = Estimate, aSE = `Robust SE`) %>% 
  mutate("a95%CI" = paste(Lower,Upper,sep="-")) %>% 
  select(-c(Lower,Upper))

table2 <- dplyr::bind_cols(model_output, model_output2) %>% rownames_to_column(var = "Category") %>% mutate_at("Category", str_replace, "BMI_breaks", "")

flextable(table2) %>% 
  save_as_docx(path = paste(tabfolder,"/Table2_complement.docx",sep=""))

table2
##   Category   PR   SE      95%CI  aPR  aSE     a95%CI
## 1  [35,40) 2.23 0.40  1.03-4.84 2.17 0.38  1.04-4.53
## 2  [40,45) 0.97 0.48   0.37-2.5 0.91 0.45  0.38-2.18
## 3  [45,50) 3.81 0.38  1.81-8.01 3.35 0.37  1.63-6.87
## 4      ≥50 7.87 0.33 4.12-15.05 7.01 0.32 3.72-13.22

Ordinal Logistic Regression Model

Collapse atelectasis percent category as done before:

dataposthoc <- data_prop
dataposthoc$atelectasis_percent <- fct_collapse(dataposthoc$atelectasis_percent,
                                              "0%" = c("0%","2.5%"),
                                              "5%" = c("5%","7.5%"),
                                              "10%" = c("10%","12.5%"),
                                              "15%" = c("15%","17.5%")
                                              )

table(dataposthoc$atelectasis_percent)
## 
##  0%  5% 10% 15% 
## 170  47   7  12

Rename new obesity category breaks levels to facilitate reading of results:

dataposthoc <- dataposthoc %>% 
  mutate(BMI_breaks=fct_recode(BMI_breaks, 
                               "1"="[30,35)", 
                               "2"="[35,40)", 
                               "3"="[40,45)",
                               "4"="[45,50)",
                               "5"="≥50"),
         atelectasis_percent=ordered(atelectasis_percent)) 

Visualize pattern of atelectasis percent increase by obesity type category.

dataposthoc %>% 
  ggplot(aes(x = BMI_breaks, fill = atelectasis_percent)) + 
  geom_bar() +
  scale_fill_manual(values = brewer.pal(5,"Blues")) +
  theme_minimal()

The loglog link function will be used since this distribution provides the best fit for the fully adjusted ordinal model:

dataposthoc %>%
  nest(data = everything()) %>%
 crossing(
    link = c("logit", "probit", "cloglog", "loglog", "cauchit"),
    threshold = c("flexible", "equidistant")
  ) %>%
  mutate(
    mod = map2(
      data, link,
      ~clm(atelectasis_percent ~ BMI_breaks + sex + age + sleep_apnea, 
        data = .x, link = .y
      )
    )
  ) %>%
  mutate(
    mod_summary = map(
      mod,
      ~glance(.x) %>% mutate(logLik = as.numeric(logLik))
    )
  ) %>%
  unnest(mod_summary) %>%
  dplyr::select(link, threshold, logLik, edf, AIC, BIC) %>%
  arrange(logLik) %>%
  gt()
link threshold logLik edf AIC BIC
cloglog equidistant -136.2562 10 292.5125 327.1508
cloglog flexible -136.2562 10 292.5125 327.1508
probit equidistant -124.5685 10 269.1371 303.7754
probit flexible -124.5685 10 269.1371 303.7754
cauchit equidistant -121.7896 10 263.5792 298.2175
cauchit flexible -121.7896 10 263.5792 298.2175
logit equidistant -121.1829 10 262.3658 297.0041
logit flexible -121.1829 10 262.3658 297.0041
loglog equidistant -118.9089 10 257.8178 292.4561
loglog flexible -118.9089 10 257.8178 292.4561
fit_uni <- clm(atelectasis_percent ~ BMI_breaks, data = dataposthoc, link = "loglog", threshold = "flexible")

summary(fit_uni)
## formula: atelectasis_percent ~ BMI_breaks
## data:    dataposthoc
## 
##  link   threshold nobs logLik  AIC    niter max.grad cond.H 
##  loglog flexible  236  -130.96 275.92 7(0)  6.07e-07 8.2e+01
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## BMI_breaks2  0.49869    0.54030   0.923    0.356    
## BMI_breaks3 -0.09443    0.60569  -0.156    0.876    
## BMI_breaks4  1.89692    0.48495   3.912 9.17e-05 ***
## BMI_breaks5  3.86819    0.53167   7.276 3.45e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Threshold coefficients:
##         Estimate Std. Error z value
## 0%|5%     2.2993     0.4084   5.631
## 5%|10%    4.4187     0.5231   8.448
## 10%|15%   4.9582     0.5550   8.933

Note that AIC (275.92) improved very much respect to initial univariable model with conventional BMI categories (391.33).

nominal_test(fit_uni, trace=TRUE)
## trying + BMI_breaks
## Tests of nominal effects
## 
## formula: atelectasis_percent ~ BMI_breaks
##            Df  logLik    AIC LRT Pr(>Chi)
## <none>        -130.96 275.92             
## BMI_breaks

Traceback of nominal_test function showed failure to converge (error code -1), similar to what happened with first model. Will fit model with nominal term and compare through ANOVA:

fit_BMInom <- clm(atelectasis_percent ~ 1, nominal= ~ BMI_breaks, data = dataposthoc, link = "loglog", threshold = "flexible")
## Warning: (-1) Model failed to converge with max|grad| = 190.071 (tol = 1e-06) 
## In addition: maximum number of consecutive Newton modifications reached
summary(fit_BMInom)
## formula: atelectasis_percent ~ 1
## nominal: ~BMI_breaks
## data:    dataposthoc
## 
##  link   threshold nobs logLik  AIC    niter max.grad cond.H  
##  loglog flexible  236  -136.78 303.55 4(0)  1.90e+02 -1.2e+20
## 
## Threshold coefficients:
##                     Estimate Std. Error z value
## 0%|5%.(Intercept)     1.8814         NA      NA
## 5%|10%.(Intercept)    3.4172         NA      NA
## 10%|15%.(Intercept)   1.3263         NA      NA
## 0%|5%.BMI_breaks2    -0.1783         NA      NA
## 5%|10%.BMI_breaks2   -0.3933         NA      NA
## 10%|15%.BMI_breaks2   2.2080         NA      NA
## 0%|5%.BMI_breaks3     0.2068         NA      NA
## 5%|10%.BMI_breaks3    1.2968         NA      NA
## 10%|15%.BMI_breaks3   0.0000         NA      NA
## 0%|5%.BMI_breaks4    -1.4306         NA      NA
## 5%|10%.BMI_breaks4   -1.0649         NA      NA
## 10%|15%.BMI_breaks4   1.9925         NA      NA
## 0%|5%.BMI_breaks5    -3.5989         NA      NA
## 5%|10%.BMI_breaks5   -2.6380         NA      NA
## 10%|15%.BMI_breaks5  -0.5268         NA      NA
anova(fit_uni,fit_BMInom)
## Likelihood ratio tests of cumulative link models:
##  
##            formula:                         nominal:    link:  threshold:
## fit_uni    atelectasis_percent ~ BMI_breaks ~1          loglog flexible  
## fit_BMInom atelectasis_percent ~ 1          ~BMI_breaks loglog flexible  
## 
##            no.par    AIC  logLik LR.stat df Pr(>Chisq)
## fit_uni         7 275.92 -130.96                      
## fit_BMInom     15 303.55 -136.78 -11.629  8          1

Model with nominal term did not converge. Thus, I will only present ordinal model.

uni_BMI_breaks <- tidy(fit_uni, conf.int = T, conf.type = "Wald") %>%
  transmute(
    term, across(c(estimate, conf.low, conf.high), exp)
  ) %>%
  gt() %>%
  fmt_number(c(estimate, conf.low, conf.high), decimals = 2) %>% 
  cols_merge(c(conf.low, conf.high),
             pattern = "{1}&mdash;{2}") %>% 
   cols_label(conf.low = "95%CI")

uni_BMI_breaks
term estimate 95%CI
0%|5% 9.97 4.48—22.19
5%|10% 82.98 29.77—231.32
10%|15% 142.34 47.96—422.43
BMI_breaks2 1.65 0.57—4.75
BMI_breaks3 0.91 0.28—2.98
BMI_breaks4 6.67 2.58—17.24
BMI_breaks5 47.86 16.88—135.67

Multivariable model

fit_multi <- clm(atelectasis_percent ~ BMI_breaks + age + sex + sleep_apnea, data = dataposthoc, link = "loglog", threshold = "flexible")

summary(fit_multi)
## formula: atelectasis_percent ~ BMI_breaks + age + sex + sleep_apnea
## data:    dataposthoc
## 
##  link   threshold nobs logLik  AIC    niter max.grad cond.H 
##  loglog flexible  236  -118.91 257.82 8(0)  5.95e-12 2.0e+05
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## BMI_breaks2     0.37446    0.54564   0.686 0.492540    
## BMI_breaks3    -0.26260    0.61977  -0.424 0.671779    
## BMI_breaks4     1.82476    0.50148   3.639 0.000274 ***
## BMI_breaks5     4.00462    0.55550   7.209 5.64e-13 ***
## age             0.01379    0.01500   0.920 0.357708    
## sexMan         -0.24020    0.46488  -0.517 0.605375    
## sleep_apneaYes  2.04946    0.40905   5.010 5.44e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Threshold coefficients:
##         Estimate Std. Error z value
## 0%|5%     2.9807     0.7691   3.875
## 5%|10%    5.3624     0.8786   6.103
## 10%|15%   5.9524     0.9054   6.574

Convergence:

convergence(fit_multi)
##  nobs logLik  niter max.grad cond.H  logLik.Error
##  236  -118.91 8(0)  5.95e-12 2.0e+05 <1e-10      
## 
##                Estimate Std.Err  Gradient     Error Cor.Dec Sig.Dig
## 0%|5%           2.98075  0.7691 -1.37e-13  5.91e-16      14      15
## 5%|10%          5.36244  0.8786 -3.22e-15  1.94e-16      15      16
## 10%|15%         5.95239  0.9054  3.33e-15  3.19e-16      15      16
## BMI_breaks2     0.37446  0.5456 -3.62e-15 -1.71e-15      14      14
## BMI_breaks3    -0.26260  0.6198  1.40e-13  2.72e-14      13      13
## BMI_breaks4     1.82476  0.5015 -4.44e-16 -1.25e-15      14      15
## BMI_breaks5     4.00462  0.5555 -6.83e-15 -1.64e-15      14      15
## age             0.01379  0.0150  5.95e-12  4.81e-17      16      15
## sexMan         -0.24020  0.4649  3.45e-14 -7.36e-16      14      14
## sleep_apneaYes  2.04946  0.4091  3.64e-14 -4.38e-16      15      16
## 
## Eigen values of Hessian:
## 8.306e+04 5.147e+01 2.511e+01 1.092e+01 1.037e+01 7.555e+00 5.777e+00 3.467e+00 1.965e+00 4.067e-01 
## 
## Convergence message from clm:
## (0) successful convergence 
## In addition: Absolute and relative convergence criteria were met

Check proportional odds assumption:

nominal_test(fit_multi)
## Tests of nominal effects
## 
## formula: atelectasis_percent ~ BMI_breaks + age + sex + sleep_apnea
##             Df  logLik    AIC    LRT Pr(>Chi)
## <none>         -118.91 257.82                
## BMI_breaks                                   
## age                                          
## sex          2 -117.68 259.36 2.4613   0.2921
## sleep_apnea  2 -118.01 260.02 1.7984   0.4069

Proportional odds OK for all variables except obesity type (as already expected) and age. Centering of age should solve this issue:

dataposthoc <- dataposthoc %>% mutate(agec = age/mean(age))

Re-fit model:

fit_multi2 <- clm(atelectasis_percent ~ BMI_breaks + agec + sex + sleep_apnea, data = dataposthoc, link = "loglog", threshold = "flexible")

summary(fit_multi2)
## formula: atelectasis_percent ~ BMI_breaks + agec + sex + sleep_apnea
## data:    dataposthoc
## 
##  link   threshold nobs logLik  AIC    niter max.grad cond.H 
##  loglog flexible  236  -118.91 257.82 8(0)  1.58e-13 2.5e+02
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## BMI_breaks2      0.3745     0.5456   0.686 0.492540    
## BMI_breaks3     -0.2626     0.6198  -0.424 0.671779    
## BMI_breaks4      1.8248     0.5015   3.639 0.000274 ***
## BMI_breaks5      4.0046     0.5555   7.209 5.64e-13 ***
## agec             0.5553     0.6037   0.920 0.357708    
## sexMan          -0.2402     0.4649  -0.517 0.605375    
## sleep_apneaYes   2.0495     0.4091   5.010 5.44e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Threshold coefficients:
##         Estimate Std. Error z value
## 0%|5%     2.9807     0.7691   3.875
## 5%|10%    5.3624     0.8786   6.103
## 10%|15%   5.9524     0.9054   6.574

Convergence:

convergence(fit_multi2)
##  nobs logLik  niter max.grad cond.H  logLik.Error
##  236  -118.91 8(0)  1.58e-13 2.5e+02 <1e-10      
## 
##                Estimate Std.Err  Gradient     Error Cor.Dec Sig.Dig
## 0%|5%            2.9807  0.7691 -1.56e-13 -6.22e-15      13      14
## 5%|10%           5.3624  0.8786  1.84e-14 -6.57e-15      13      14
## 10%|15%          5.9524  0.9054 -2.29e-14 -7.65e-15      13      14
## BMI_breaks2      0.3745  0.5456  0.00e+00 -3.27e-15      14      14
## BMI_breaks3     -0.2626  0.6198  1.39e-13  2.48e-14      13      13
## BMI_breaks4      1.8248  0.5015 -4.44e-16 -3.53e-15      14      15
## BMI_breaks5      4.0046  0.5555  1.89e-15 -3.85e-15      14      15
## agec             0.5553  0.6037  1.58e-13 -2.65e-15      14      14
## sexMan          -0.2402  0.4649  3.90e-14  2.04e-18      17      17
## sleep_apneaYes   2.0495  0.4091  3.94e-14 -9.18e-16      14      15
## 
## Eigen values of Hessian:
## 93.2384 51.4260 24.1693 10.9191 10.1386  6.9042  4.8502  3.4383  1.6303  0.3765 
## 
## Convergence message from clm:
## (0) successful convergence 
## In addition: Absolute and relative convergence criteria were met

Check proportional odds assumption:

nominal_test(fit_multi2)
## Tests of nominal effects
## 
## formula: atelectasis_percent ~ BMI_breaks + agec + sex + sleep_apnea
##             Df  logLik    AIC    LRT Pr(>Chi)
## <none>         -118.91 257.82                
## BMI_breaks                                   
## agec                                         
## sex          2 -117.68 259.36 2.4613   0.2921
## sleep_apnea  2 -118.01 260.02 1.7984   0.4069

Issue not solved. Will report uncentered age as proportional odds assumption is not as relevant for covariates.

Check for scale effects:

scale_test(fit_multi2)
## Tests of scale effects
## 
## formula: atelectasis_percent ~ BMI_breaks + agec + sex + sleep_apnea
##             Df  logLik    AIC    LRT Pr(>Chi)
## <none>         -118.91 257.82                
## BMI_breaks   4 -115.04 258.08 7.7382   0.1017
## agec         1 -118.82 259.64 0.1754   0.6754
## sex          1 -118.13 258.26 1.5572   0.2121
## sleep_apnea  1 -118.77 259.53 0.2877   0.5917

No scale effects.

Summarize results of first model in table:

tidy(fit_multi, conf.int = T, conf.type = "Wald") %>%
  transmute(
    term, across(c(estimate, conf.low, conf.high), exp)
  ) %>%
  gt() %>%
  fmt_number(c(estimate, conf.low, conf.high), decimals = 2) %>% 
  cols_merge(c(conf.low, conf.high),
             pattern = "{1}&mdash;{2}") %>% 
  cols_label(conf.low = "95%CI")
term estimate 95%CI
0%|5% 19.70 4.36—88.96
5%|10% 213.24 38.11—1,193.36
10%|15% 384.67 65.22—2,268.68
BMI_breaks2 1.45 0.50—4.24
BMI_breaks3 0.77 0.23—2.59
BMI_breaks4 6.20 2.32—16.57
BMI_breaks5 54.85 18.46—162.94
age 1.01 0.98—1.04
sexMan 0.79 0.32—1.96
sleep_apneaYes 7.76 3.48—17.31

Partial proportional odds model

The probit link function will be used since this distribution provides the best fit for the fully adjusted model with nominal BMI terms:

dataposthoc %>%
  nest(data = everything()) %>%
 crossing(
    link = c("logit", "probit", "cloglog", "loglog", "cauchit"),
    threshold = c("flexible", "equidistant")
  ) %>%
  mutate(
    mod = map2(
      data, link,
      ~clm(atelectasis_percent ~ sex + age + sleep_apnea, nominal = ~ BMI_breaks,
        data = .x, link = .y
      )
    )
  ) %>%
  mutate(
    mod_summary = map(
      mod,
      ~glance(.x) %>% mutate(logLik = as.numeric(logLik))
    )
  ) %>%
  unnest(mod_summary) %>%
  dplyr::select(link, threshold, logLik, edf, AIC, BIC) %>%
  arrange(logLik) %>%
  gt()
link threshold logLik edf AIC BIC
cauchit equidistant -122.5310 18 281.0619 343.4109
cauchit flexible -122.5310 18 281.0619 343.4109
loglog equidistant -118.1566 18 272.3132 334.6622
loglog flexible -118.1566 18 272.3132 334.6622
logit equidistant -116.7621 18 269.5241 331.8731
logit flexible -116.7621 18 269.5241 331.8731
cloglog equidistant -115.4116 18 266.8231 329.1721
cloglog flexible -115.4116 18 266.8231 329.1721
probit equidistant -112.8364 18 261.6729 324.0218
probit flexible -112.8364 18 261.6729 324.0218

Multivariable model

Fit a model that allows proportional odds for all explanatory variables, except BMI breaks:

fit_BMIpart <- vglm(atelectasis_percent ~ BMI_breaks+sleep_apnea+sex+agec, data=dataposthoc, link="probit", family=cumulative(parallel=FALSE~BMI_breaks, reverse = TRUE))

summary(fit_BMIpart)
## 
## Call:
## vglm(formula = atelectasis_percent ~ BMI_breaks + sleep_apnea + 
##     sex + agec, family = cumulative(parallel = FALSE ~ BMI_breaks, 
##     reverse = TRUE), data = dataposthoc, link = "probit")
## 
## Coefficients: 
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1    -3.1556     0.9404  -3.356 0.000792 ***
## (Intercept):2    -5.2392     1.3672  -3.832 0.000127 ***
## (Intercept):3    -5.2392     1.3672  -3.832 0.000127 ***
## BMI_breaks2:1     0.5447     0.6074   0.897 0.369884    
## BMI_breaks2:2     0.9595     1.3094   0.733 0.463684    
## BMI_breaks2:3     0.1719     1.4878   0.116 0.908037    
## BMI_breaks3:1    -0.3682     0.7025  -0.524 0.600222    
## BMI_breaks3:2   -15.8638  1886.9705      NA       NA    
## BMI_breaks3:3   -16.2682  2309.8837      NA       NA    
## BMI_breaks4:1     2.1480     0.5975   3.595 0.000325 ***
## BMI_breaks4:2     1.4814     1.2966   1.142 0.253253    
## BMI_breaks4:3     0.2223     1.5445   0.144 0.885558    
## BMI_breaks5:1    19.8276   816.4253      NA       NA    
## BMI_breaks5:2     3.9944     1.1612   3.440 0.000582 ***
## BMI_breaks5:3     3.3507     1.1756   2.850 0.004370 ** 
## sleep_apneaYes    2.6305     0.5908   4.452 8.49e-06 ***
## sexMan            0.0059     0.6166   0.010 0.992365    
## agec              0.6393     0.7689      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Names of linear predictors: logitlink(P[Y>=2]), logitlink(P[Y>=3]), 
## logitlink(P[Y>=4])
## 
## Residual deviance: 231.5156 on 690 degrees of freedom
## 
## Log-likelihood: -115.7578 on 690 degrees of freedom
## 
## Number of Fisher scoring iterations: 29 
## 
## Warning: Hauck-Donner effect detected in the following estimate(s):
## '(Intercept):2', '(Intercept):3', 'BMI_breaks3:2', 'BMI_breaks3:3', 'BMI_breaks5:1', 'sleep_apneaYes', 'agec'
## 
## 
## Exponentiated coefficients:
##  BMI_breaks2:1  BMI_breaks2:2  BMI_breaks2:3  BMI_breaks3:1  BMI_breaks3:2 
##   1.724076e+00   2.610434e+00   1.187509e+00   6.919932e-01   1.289564e-07 
##  BMI_breaks3:3  BMI_breaks4:1  BMI_breaks4:2  BMI_breaks4:3  BMI_breaks5:1 
##   8.605832e-08   8.567477e+00   4.398903e+00   1.248940e+00   4.083510e+08 
##  BMI_breaks5:2  BMI_breaks5:3 sleep_apneaYes         sexMan           agec 
##   5.429348e+01   2.852206e+01   1.388102e+01   1.005917e+00   1.895157e+00
confint_BMIpart <- round(exp(confintvglm(fit_BMIpart,method = "wald")),2)
confint_BMIpart
##                2.5 % 97.5 %
## (Intercept):1   0.01   0.27
## (Intercept):2   0.00   0.08
## (Intercept):3   0.00   0.08
## BMI_breaks2:1   0.52   5.67
## BMI_breaks2:2   0.20  33.98
## BMI_breaks2:3   0.06  21.93
## BMI_breaks3:1   0.17   2.74
## BMI_breaks3:2   0.00    Inf
## BMI_breaks3:3   0.00    Inf
## BMI_breaks4:1   2.66  27.64
## BMI_breaks4:2   0.35  55.85
## BMI_breaks4:3   0.06  25.78
## BMI_breaks5:1   0.00    Inf
## BMI_breaks5:2   5.58 528.66
## BMI_breaks5:3   2.85 285.67
## sleep_apneaYes  4.36  44.19
## sexMan          0.30   3.37
## agec            0.42   8.55

Hauck-Donner effect leading to unreliable estumates. Thus, will not report this model.